# Importing Libraries

In [45]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import string
import warnings
import re

from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

ps=PorterStemmer()
wordnet= WordNetLemmatizer()
sw=stopwords.words('english')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Importing Data

In [46]:
data=pd.read_csv('Twitter Sentiments.csv')

In [47]:
data.sample(20)

Unnamed: 0,id,label,tweet
4765,4766,0,just pre-ordered my copy! #harrypotterandthe...
8349,8350,0,such sad news about jo cox. what is this world...
19255,19256,0,do you dare to fight me?ð¡ hahahaha #puppy #...
381,382,0,@user #tagsforlikes #me #food#friends#summer#...
14592,14593,0,@user on our way to the #cmtredcarpet â¨ #...
21037,21038,0,#model i love u take with u all the time in ...
26641,26642,0,this is what happens when a race feels superio...
7138,7139,0,@user father's day to all the fathers world...
22191,22192,0,@user registered for #biggboss10! waiting for ...
4365,4366,0,bored #cntfindfriends #snap


# EDA

#####  Removing usernames i.e pattern @user

In [48]:
data['tweet']=data['tweet'].apply(lambda x: re.sub("@\S+","",x))

##### Removing special characters,numbers and punctuations and replacing with space

In [49]:
data['tweet']=data['tweet'].apply(lambda x: re.sub("[^a-zA-Z#]"," ",x))
data.sample(3)

Unnamed: 0,id,label,tweet
27204,27205,0,you can t help the damsel if she loves her mis...
30480,30481,0,#pray for #orlando so this continues to hap...
12050,12051,0,#early bull up you will dominate your bull ...


# Preprocessing of Tweets

### 1) lower case 2) Removing Punctuations 3) Removing Stopwords-SW 4)Word Lemmatizaion

In [50]:
data['tweet']=data['tweet'].apply(lambda x:x.lower())
data['tweet']=data['tweet'].apply(lambda x : x.translate(str.maketrans(dict.fromkeys(string.punctuation))))
data['tweet']=data['tweet'].apply(lambda x : " ".join([word for word in x.split() if word not in (sw)]))
data['tweet']=data['tweet'].apply(lambda x :" ".join([wordnet.lemmatize(word) for word in x.split()]))

In [51]:
data.head(15)

Unnamed: 0,id,label,tweet
0,1,0,father dysfunctional selfish drag kid dysfunct...
1,2,0,thanks lyft credit use cause offer wheelchair ...
2,3,0,bihday majesty
3,4,0,model love u take u time ur
4,5,0,factsguide society motivation
5,6,0,huge fan fare big talking leave chaos pay disp...
6,7,0,camping tomorrow danny
7,8,0,next school year year exam think school exam h...
8,9,0,love land allin cavs champion cleveland clevel...
9,10,0,welcome gr


### Label describes positive and negative impact of tweets i.e. 1 for offenssive comment and 0 for General Comment

In [52]:
data['label'].sample(25)

10935    0
8425     0
29157    0
2341     0
31282    0
10304    0
1240     0
10716    0
31718    1
626      0
8521     0
6444     0
24205    0
5991     0
21344    1
371      0
25108    0
17363    0
10793    0
18671    0
24722    0
14196    0
29008    0
6986     0
19523    0
Name: label, dtype: int64

In [53]:
x=data['tweet']
y=data['label']

# Train-Test Split

In [54]:
X_train , X_test, y_train, y_test = train_test_split(x,y, test_size = 0.2 , random_state = 101)

In [55]:
train_data = pd.DataFrame({'tweet':X_train , 'tweet_category':y_train})
test_data = pd.DataFrame({'tweet':X_test , 'tweet_category':y_test})

In [56]:
train_data.head(3)

Unnamed: 0,tweet,tweet_category
22711,wait till green leaf come gonna nail bitter sure,0
2688,model love u take u time ur,0
1120,lol watched novela parent,0


In [57]:
test_data.tail(5)

Unnamed: 0,tweet,tweet_category
9900,increasingly angered leave campaign stupidity ...,0
10836,yes going july,0
22075,missing paner crime return pa sunday lucky spe...,0
31468,jts troxy theater jts love sinkthepink music i...,0
17609,finished book thinking forget past hard think ...,0


### training the train data

In [58]:
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(train_data['tweet'])
test_vectors = vectorizer.transform(test_data['tweet'])

### resampling to handle imbalance data 

In [59]:
# check version number
import imblearn
# print(imblearn.__version__)
from imblearn.over_sampling import SMOTE
oversample = SMOTE(k_neighbors=5)
# transform the dataset
train_vectors, y_train = oversample.fit_resample(train_vectors, y_train)

In [64]:
model=LogisticRegression()
model.fit(train_vectors,y_train)

LogisticRegression()

### Prediction

In [61]:
pred = model.predict(test_vectors)
pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [62]:
report = classification_report(test_data['tweet_category'] , pred , output_dict=True)
print(f"Offensive {report['1']['recall']}")
print(f"Non-Offensive {report['0']['recall']}")
abc=pd.DataFrame(report)
abc

Offensive 0.8171296296296297
Non-Offensive 0.9275289380976346


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.985913,0.449682,0.920069,0.717797,0.949678
recall,0.927529,0.81713,0.920069,0.872329,0.920069
f1-score,0.95583,0.580115,0.920069,0.767973,0.930442
support,5961.0,432.0,0.920069,6393.0,6393.0


###### random test of predictions

In [63]:
a  = input('write the tweet: ')
vector = vectorizer.transform([a]).toarray()
xyz = model.predict(vector)
print(xyz)

write the tweet: hey beautiful
[0]


## Saving the model

In [67]:
import joblib
model_filename='twitter_Analysis_model.pkl'
vector_filename='twitter_Analysis_vector.pkl'
joblib.dump(model,model_filename)
joblib.dump(vectorizer,vector_filename)

['twitter_Analysis_vector.pkl']