In [15]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

## Data Processing

In [27]:
# Load the raw tweet CSV into a DataFrame.
# NOTE(review): no `names=`/`header=` is passed, so the first CSV row is used as
# the column labels (the head() output below shows a tweet in the header and the
# shape is one row short of 1,600,000). A later cell re-reads the file with
# explicit column names.
data = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='ISO-8859-1')

In [28]:
# Preview the first rows of the freshly loaded frame.
data.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [29]:
# (rows, columns) of the frame loaded above.
data.shape

(1599999, 6)

In [30]:
# Fetch the NLTK stopword corpus that the stemming step filters against.
# The recorded output shows this is a no-op when the package is already present.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\satba\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [31]:
# Show the English stopword list; stemming() drops these words from every tweet.
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [32]:
# The CSV has no header row, so supply the six column labels explicitly.
# This re-read replaces the earlier load, which consumed the first tweet as the header.
column_names = ['target','id','date','flag','user','text']
data = pd.read_csv('training.1600000.processed.noemoticon.csv', names=column_names, encoding='ISO-8859-1')

In [34]:
# NOTE(review): a cell only echoes its last expression, so the head() result on
# the next line is computed and discarded; the recorded output shows just the shape.
data.head()
# All 1,600,000 rows are present now that the first row is no longer used as a header.
data.shape

(1600000, 6)

In [35]:
# Count missing values per column.
data.isnull().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [36]:
# Class balance: number of rows per distinct value of the target column.
data['target'].value_counts()

0    800000
4    800000
Name: target, dtype: int64

In [40]:
# Relabel the positive class from 4 to 1 so the target is a plain 0/1 label.
# Assigning the replaced column back avoids mutating the whole frame in place.
data['target'] = data['target'].replace({4: 1})
data['target'].value_counts()

0    800000
1    800000
Name: target, dtype: int64

0 --> Negative tweet

1 --> Positive tweet

## Stemming

#### Stemming is the process of reducing a word to its root word

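Example: acting, acted, acts → act (the Porter stemmer strips common suffixes; it does not merge different words such as actor/actress into one stem)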

In [41]:
# One shared Porter stemmer instance, reused by stemming() for every word.
port_stem=PorterStemmer()

In [44]:
# Lazily-built cache of stopword sets, keyed by language name.
_STOPWORD_SETS = {}


def _stopword_set(language='english'):
    """Return the NLTK stopwords for `language` as a frozenset, built only once."""
    if language not in _STOPWORD_SETS:
        _STOPWORD_SETS[language] = frozenset(stopwords.words(language))
    return _STOPWORD_SETS[language]


def stemming(content):
    """Clean one tweet and reduce its words to their Porter stems.

    Steps, in order:
      1. every character that is not an ASCII letter is replaced by a space
         (digits and punctuation vanish; URLs and @mentions are only broken
         into their letter runs, not removed);
      2. the text is lower-cased and split on whitespace;
      3. English stopwords are dropped;
      4. each remaining word is stemmed with the shared `port_stem` instance;
      5. the stems are re-joined with single spaces.

    Parameters
    ----------
    content : str
        Raw tweet text.

    Returns
    -------
    str
        Space-separated stems; an empty string when nothing survives filtering.
    """
    # The stopword list is fetched once and held as a set. Calling
    # stopwords.words('english') per token re-reads the corpus and does a
    # linear list scan for every word of every tweet.
    english_stopwords = _stopword_set()
    tokens = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    return ' '.join(
        port_stem.stem(token) for token in tokens if token not in english_stopwords
    )
    
    

In [45]:
# Run stemming() on every tweet and keep the result in a new column next to the raw text.
data['stemmed_content']=data['text'].apply(stemming)

In [46]:
# Preview the frame, now including the stemmed_content column.
data.head()

Unnamed: 0,target,id,date,flag,user,text,stemmed_content
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com zl awww bummer sho...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset updat facebook text might cri result sch...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dive mani time ball manag save rest g...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behav mad see


In [47]:
# Show the label column after relabelling.
print(data['target'])

0          0
1          0
2          0
3          0
4          0
          ..
1599995    1
1599996    1
1599997    1
1599998    1
1599999    1
Name: target, Length: 1600000, dtype: int64


In [48]:
# Separate features and labels: X holds the stemmed tweet text, Y holds the target labels.
X=data['stemmed_content'].values
Y=data['target'].values

## Splitting into train and test sets

In [49]:
# Hold out 25% of the rows for testing. `stratify=Y` keeps the label proportions
# the same in both parts, and the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, stratify=Y, random_state=42)
# Sizes of the full, training and test sets.
print(X.shape, X_train.shape, X_test.shape)

(1600000,) (1200000,) (400000,)


In [50]:
# Convert the stemmed text into numeric TF-IDF features.
vectorizer = TfidfVectorizer() ##assigns importance weights to each word

# Fit on the training text only, then apply that same fitted vectorizer to the test text.
# NOTE(review): X_train / X_test are rebound from text to the transformed output,
# so re-running this cell without re-running the split would pass the
# already-transformed data back into the vectorizer.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)


In [51]:
# Inspect the transformed training features; the output lists (row, column) weight entries.
print(X_train)

  (0, 329087)	0.37114576516137004
  (0, 270080)	0.2848257507626687
  (0, 164224)	0.1291619965500788
  (0, 143033)	0.3295201214280994
  (0, 65243)	0.26504902007898723
  (0, 41850)	0.2264141582743051
  (0, 300489)	0.3126861227355235
  (0, 401416)	0.36853694309442253
  (0, 401353)	0.20061261350643886
  (0, 63215)	0.3321445834551768
  (0, 90000)	0.11726770082140679
  (0, 251138)	0.2982973449998188
  (0, 143703)	0.10985655153238971
  (0, 433450)	0.18811010184691346
  (1, 221730)	0.5157765002792987
  (1, 361121)	0.1844917502868393
  (1, 221002)	0.2068967643905159
  (1, 104211)	0.47616441717438895
  (1, 287644)	0.4054296169461297
  (1, 200734)	0.5157765002792987
  (2, 119514)	0.6678141351651221
  (2, 14341)	0.47863517442392484
  (2, 231879)	0.41663992718195264
  (2, 90000)	0.3890293327702376
  (3, 222902)	1.0
  :	:
  (1199997, 129946)	0.2871361202598447
  (1199998, 215911)	0.35105963277342433
  (1199998, 47042)	0.2494198196486021
  (1199998, 68166)	0.260820305220176
  (1199998, 31613)	0.55102

In [52]:
# Inspect the transformed test features; the output lists (row, column) weight entries.
print(X_test)

  (0, 427150)	0.17840365810438702
  (0, 377117)	0.3779572929145396
  (0, 220976)	0.28457310117091716
  (0, 218544)	0.2704345128097195
  (0, 163352)	0.24226464084914848
  (0, 145601)	0.19352737644815407
  (0, 143703)	0.15858001545784028
  (0, 70744)	0.596444985071805
  (0, 68249)	0.3568119500564812
  (0, 18738)	0.2585531531722762
  (1, 427150)	0.4789337634078441
  (1, 391890)	0.7677162141132656
  (1, 143703)	0.4257161787571548
  (2, 368576)	0.2566036727054914
  (2, 359134)	0.521897339240824
  (2, 304836)	0.3016452701775643
  (2, 300177)	0.46260834929303674
  (2, 263584)	0.3712705267062053
  (2, 153013)	0.2826536606096508
  (2, 119660)	0.30935977029063816
  (2, 90000)	0.20818982986750875
  (3, 418581)	0.29103512560728023
  (3, 181169)	0.5616253930087194
  (3, 155453)	0.4201167312666686
  (3, 104052)	0.5184182962880249
  :	:
  (399996, 8918)	0.40664324880333724
  (399997, 90000)	0.4446766944249646
  (399997, 78812)	0.8956911506960905
  (399998, 434043)	0.46050163002695244
  (399998, 42055

## Training the ML model - Logistic Regression

In [53]:
# Logistic-regression classifier with the iteration cap set to 1000.
model = LogisticRegression(max_iter=1000)

In [54]:
# Fit the classifier on the TF-IDF training features and their labels.
model.fit(X_train, Y_train)

LogisticRegression(max_iter=1000)

## Model Evaluation

Accuracy Score

In [56]:
# Accuracy on the data the model was trained on.
# NOTE: despite its name, X_train_pred holds the predicted labels for X_train.
X_train_pred=model.predict(X_train)
training_accuracy=accuracy_score(Y_train, X_train_pred)

In [57]:
# Report the training-set accuracy computed in the previous cell.
print('Accuracy score on the training data :', training_accuracy)

Accuracy score on the training data : 0.8102458333333333


In [58]:
# Accuracy on the held-out test split.
# NOTE: despite its name, X_test_pred holds the predicted labels for X_test.
X_test_pred=model.predict(X_test)
testing_accuracy=accuracy_score(Y_test, X_test_pred)

In [59]:
# Report the test-set accuracy computed in the previous cell.
print('Accuracy score on the test data :', testing_accuracy)

Accuracy score on the test data : 0.778375


Model accuracy on the held-out test set = 77.8 % (training accuracy: 81.0 %)

## Saving the trained model

In [60]:
import pickle

In [61]:
filename='trained_model.sav'
# Write the fitted classifier in binary format. The context manager guarantees
# the file is flushed and closed, even if pickling raises.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)
# NOTE(review): only the classifier is persisted here. Predicting on new raw text
# in another session would also need the fitted `vectorizer`, which is not saved.


## Using a saved model for future predictions

In [62]:
#loading the saved model
# SECURITY: unpickling can execute arbitrary code, so only load files you created or trust.
# The context manager closes the file handle as soon as the model is read.
with open('trained_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [63]:
# Take one already-vectorized test row and print its true label.
X_new = X_test[200]
print(Y_test[200])

# Predict with the model restored from disk rather than the in-memory `model`,
# so this cell actually exercises the save/load round trip.
prediction = loaded_model.predict(X_new)
print(prediction)

# Label 0 is a negative tweet; anything else (here 1) is positive.
if prediction[0] == 0:
    print('Negative Tweet')
else:
    print('Positive Tweet')

1
[1]
Positive Tweet


In [64]:
# Take another already-vectorized test row and print its true label.
X_new = X_test[3]
print(Y_test[3])

# Predict with the model restored from disk rather than the in-memory `model`,
# so this cell actually exercises the save/load round trip.
prediction = loaded_model.predict(X_new)
print(prediction)

# Label 0 is a negative tweet; anything else (here 1) is positive.
if prediction[0] == 0:
    print('Negative Tweet')
else:
    print('Positive Tweet')

0
[0]
Negative Tweet
