# Importing dependencies

In [29]:
import numpy as np
import pandas as pd
import re 
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
import nltk
# Fetch the NLTK stop-word corpus that stopwords.words('english') reads during preprocessing.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rahulwankhade/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Data preprocessing

In [3]:
# Column labels for the raw CSV, listed in the order they are passed to read_csv as `names`.
columns_names = [
    'target',
    'id',
    'date',
    'flag',
    'user',
    'text',
]

In [4]:
# Load the raw CSV into a DataFrame. `names=` supplies the column labels,
# so the first line of the file is read as data rather than as a header.
twitter_data = pd.read_csv(
    '/Users/rahulwankhade/Downloads/TweeterData.csv',
    names=columns_names,
    encoding='ISO-8859-1',
)

In [5]:
# (rows, columns) of the loaded frame.
twitter_data.shape

(1600000, 6)

In [6]:
 twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [7]:
# Check for missing values: info() lists the non-null count and dtype of every column.
twitter_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   target  1600000 non-null  int64 
 1   id      1600000 non-null  int64 
 2   date    1600000 non-null  object
 3   flag    1600000 non-null  object
 4   user    1600000 non-null  object
 5   text    1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [8]:
# Check class balance: count how many rows carry each target label.
twitter_data['target'].value_counts()

0    800000
4    800000
Name: target, dtype: int64

In [9]:
# Relabel the positive class from 4 to 1 so the target is binary 0/1.
# The column is assigned back explicitly instead of relying on `inplace=True`,
# which keeps the mutation visible at the call site.
twitter_data['target'] = twitter_data['target'].replace({4: 1})

In [10]:
# Re-check the label distribution now that label 4 has been mapped to 1.
twitter_data['target'].value_counts()

0    800000
1    800000
Name: target, dtype: int64

# Stemming

In [11]:
# Single PorterStemmer instance, reused by stemming() for every token.
port_stem = PorterStemmer()

In [12]:
def _english_stopwords():
    """Return the English stop-word set, loading it from NLTK on first use only."""
    cached = getattr(_english_stopwords, 'cache', None)
    if cached is None:
        # Loading the corpus list is expensive, so it is done once and kept as a
        # frozenset for constant-time membership checks.
        cached = frozenset(stopwords.words('english'))
        _english_stopwords.cache = cached
    return cached


def stemming(content):
    """Normalise a tweet into a space-separated string of stemmed tokens.

    Every non-letter character is replaced with a space, the text is
    lowercased and split on whitespace, English stop words are dropped, and
    the remaining tokens are Porter-stemmed.

    Parameters
    ----------
    content : str
        Raw tweet text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces; an empty string when no
        token survives the filtering.
    """
    stop_words = _english_stopwords()
    letters_only = re.sub('[^a-zA-Z]', ' ', content)
    tokens = letters_only.lower().split()
    stems = [port_stem.stem(token) for token in tokens if token not in stop_words]
    return ' '.join(stems)

In [13]:
# Run stemming() on every tweet and store the result in a new column.
# This is applied row by row over the whole frame, so it is the slow step of the notebook.
twitter_data['stemmed_content'] = twitter_data['text'].apply(stemming)

In [14]:
# Preview the frame, now including the stemmed_content column.
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text,stemmed_content
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com zl awww bummer sho...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset updat facebook text might cri result sch...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dive mani time ball manag save rest g...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behav mad see


In [15]:
# Separate the model inputs (stemmed tweet text) from the labels (target).
x = twitter_data['stemmed_content'].values
y = twitter_data['target'].values

# Splitting the data

In [16]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions equal in both splits; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2,
)

In [17]:
# Convert the stemmed text to TF-IDF feature matrices.
# NOTE(review): x_train / x_test are rebound from raw text to feature matrices here,
# so this cell should not be re-run without re-running the split cell first.

vectorizer = TfidfVectorizer()

# Fit the vocabulary and IDF weights on the training split only, then reuse them
# unchanged for the test split so no test information reaches the fitted vectorizer.
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

# Logistic Regression

In [18]:
# Logistic-regression classifier; max_iter is raised to 1000 (scikit-learn's default is 100),
# presumably so the solver converges on this dataset — TODO confirm.
model = LogisticRegression(max_iter=1000)

In [19]:
# Fit the classifier on the TF-IDF training matrix and its labels.
model.fit(x_train,y_train)

In [20]:
# Evaluate logistic regression on the training split: predict, then print the
# per-class precision / recall / F1 together with the overall accuracy.
y_pred_train = model.predict(x_train)

print(classification_report(y_train,y_pred_train))

              precision    recall  f1-score   support

           0       0.82      0.79      0.81    640000
           1       0.80      0.83      0.81    640000

    accuracy                           0.81   1280000
   macro avg       0.81      0.81      0.81   1280000
weighted avg       0.81      0.81      0.81   1280000



In [21]:
# Evaluate logistic regression on the held-out test split: predict, then print the
# per-class precision / recall / F1 together with the overall accuracy.
y_pred = model.predict(x_test)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.79      0.76      0.77    160000
           1       0.77      0.80      0.78    160000

    accuracy                           0.78    320000
   macro avg       0.78      0.78      0.78    320000
weighted avg       0.78      0.78      0.78    320000



* Since the gap between the training accuracy (0.81) and the test accuracy (0.78) is small, the model shows little overfitting. The test accuracy was 0.78. Let's try different models for higher accuracy.

# Naive Bayes

In [22]:
# Single-step pipeline wrapping MultinomialNB. The step name 'NaiveBayes' is the
# prefix used by the grid-search parameter key 'NaiveBayes__alpha'.
clf = Pipeline(steps=[('NaiveBayes', MultinomialNB())])

In [23]:
# Fit the Naive Bayes pipeline on the TF-IDF training matrix and its labels.
clf.fit(x_train, y_train)

In [24]:
# Evaluate Naive Bayes on the training split and print the classification report.
# NOTE: this rebinds y_pred_train, replacing the logistic-regression predictions
# stored under the same name earlier in the notebook.
y_pred_train = clf.predict(x_train)

print(classification_report(y_train,y_pred_train))

              precision    recall  f1-score   support

           0       0.81      0.84      0.82    640000
           1       0.84      0.80      0.82    640000

    accuracy                           0.82   1280000
   macro avg       0.82      0.82      0.82   1280000
weighted avg       0.82      0.82      0.82   1280000



In [25]:
# Evaluate Naive Bayes on the held-out test split and print the classification report.
# NOTE: this rebinds y_pred, replacing the logistic-regression predictions
# stored under the same name earlier in the notebook.
y_pred = clf.predict(x_test)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.74      0.78      0.76    160000
           1       0.77      0.73      0.75    160000

    accuracy                           0.76    320000
   macro avg       0.76      0.76      0.76    320000
weighted avg       0.76      0.76      0.76    320000



In [27]:
# Candidate smoothing values for the pipeline's 'NaiveBayes' step
# (keys follow the <step name>__<parameter> convention).
param_grid = dict(NaiveBayes__alpha=[0.1, 0.5, 1.0])

In [28]:
# NOTE: the NameError recorded below shows this cell was run before GridSearchCV was
# imported; the same constructor call is repeated in the next cell.
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

NameError: name 'GridSearchCV' is not defined

In [31]:
# 5-fold cross-validated grid search over the Naive Bayes alpha values,
# scored by accuracy and run in parallel on all available cores (n_jobs=-1).
# NOTE(review): x_train was already TF-IDF transformed by a vectorizer fitted on the
# whole training split, so the IDF weights are shared across folds — confirm this is acceptable.
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(x_train, y_train)

In [32]:
# Parameter setting that achieved the best cross-validated score.
grid_search.best_params_

{'NaiveBayes__alpha': 1.0}

In [33]:
# Mean cross-validated accuracy of the best parameter setting.
grid_search.best_score_

0.7548984375

* Since logistic regression achieved a higher test accuracy (0.78) than Naive Bayes (0.76), we will save the trained logistic regression model.

# Saving the logistic regression model

In [34]:
import pickle

In [35]:
# Output path for the pickled model; it is relative, so it resolves against the
# notebook's current working directory.
filename = 'trained_model.sav'

In [36]:
# Persist the fitted logistic-regression model. The `with` block guarantees the
# file handle is flushed and closed even if pickling raises.
# NOTE(review): only the classifier is saved here; the fitted TfidfVectorizer is not,
# so the pickle alone cannot score raw text — confirm whether it should be saved too.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# Loading the saved model

In [37]:
# Reload the model from the same path it was saved to (`filename`) rather than a
# machine-specific absolute path, and close the file handle deterministically.
# SECURITY: pickle.load can execute arbitrary code; only load files you created yourself.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [39]:
# Sanity-check the reloaded model on one held-out example:
# print the true label first, then the raw prediction and its readable form.
x_new = x_test[3]
print(y_test[3])

prediction = loaded_model.predict(x_new)
print(prediction)
# Label 0 is the negative class; anything else is reported as positive.
print('Negative Tweet' if prediction[0] == 0 else 'Positive Tweet')

0
[0]
Negative Tweet
