In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score




In [2]:
import nltk
nltk.download('stopwords')



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
print(stopwords.words('english'))



['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [4]:
#reading csv file
TD = pd.read_csv('dataset.csv' ,  encoding='ISO-8859-1')


In [5]:
#checking the data
print(TD.shape)
print(TD.head)

(1599999, 6)
<bound method NDFrame.head of          0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY  \
0        0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   
1        0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY   
2        0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
3        0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
4        0  1467811372  Mon Apr 06 22:20:00 PDT 2009  NO_QUERY   
...     ..         ...                           ...       ...   
1599994  4  2193601966  Tue Jun 16 08:40:49 PDT 2009  NO_QUERY   
1599995  4  2193601969  Tue Jun 16 08:40:49 PDT 2009  NO_QUERY   
1599996  4  2193601991  Tue Jun 16 08:40:49 PDT 2009  NO_QUERY   
1599997  4  2193602064  Tue Jun 16 08:40:49 PDT 2009  NO_QUERY   
1599998  4  2193602129  Tue Jun 16 08:40:50 PDT 2009  NO_QUERY   

         _TheSpecialOne_  \
0          scotthamilton   
1               mattycus   
2                ElleCTF   
3                 Karoli   
4               joy_wolf

In [6]:
#renaming the column
C_N = ['target','id','date','flag','user','text']
TD = pd.read_csv('dataset.csv' , names = C_N , encoding='ISO-8859-1')
print(TD.shape)

(1600000, 6)


In [7]:
#missing values
print(TD.isnull().sum())

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64


In [8]:
#checking the distribution of target colunmn
print(TD['target'].value_counts())

target
0    800000
4    800000
Name: count, dtype: int64


In [9]:
#converting the target '4' to '1' so that there are only positive(1) and negative(0)
TD.replace({'target':{4:1}} , inplace = True)
print(TD['target'].value_counts())

target
0    800000
1    800000
Name: count, dtype: int64


In [10]:
#Stemming :- process of reducing words to its rootwords
port = PorterStemmer()

In [11]:
def stemming(content):
    stemmed = re.sub('[^a-zA-Z]',' ',content)
    stemmed = stemmed.lower()
    stemmed = stemmed.split()
    stemmed = [port.stem(word) for word in stemmed if not word in stopwords.words('english')]
    stemmed = ' '.join(stemmed)
    
    return stemmed

In [12]:
# 64 minutes time to process
TD['stemmed'] = TD['text'].apply(stemming)

In [13]:
print(TD.head)

<bound method NDFrame.head of          target          id                          date      flag  \
0             0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY   
1             0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   
2             0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY   
3             0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
4             0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
...         ...         ...                           ...       ...   
1599995       1  2193601966  Tue Jun 16 08:40:49 PDT 2009  NO_QUERY   
1599996       1  2193601969  Tue Jun 16 08:40:49 PDT 2009  NO_QUERY   
1599997       1  2193601991  Tue Jun 16 08:40:49 PDT 2009  NO_QUERY   
1599998       1  2193602064  Tue Jun 16 08:40:49 PDT 2009  NO_QUERY   
1599999       1  2193602129  Tue Jun 16 08:40:50 PDT 2009  NO_QUERY   

                    user                                               text  \
0        _TheSpecialOne_  @switchfoot 

In [14]:
print(TD['stemmed'])
print(TD['target'])

0          switchfoot http twitpic com zl awww bummer sho...
1          upset updat facebook text might cri result sch...
2          kenichan dive mani time ball manag save rest g...
3                            whole bodi feel itchi like fire
4                              nationwideclass behav mad see
                                 ...                        
1599995                           woke school best feel ever
1599996    thewdb com cool hear old walt interview http b...
1599997                         readi mojo makeov ask detail
1599998    happi th birthday boo alll time tupac amaru sh...
1599999    happi charitytuesday thenspcc sparkschar speak...
Name: stemmed, Length: 1600000, dtype: object
0          0
1          0
2          0
3          0
4          0
          ..
1599995    1
1599996    1
1599997    1
1599998    1
1599999    1
Name: target, Length: 1600000, dtype: int64


In [15]:
#separating the data and label
X = TD['stemmed'].values
Y = TD['target'].values
print(X)
print(Y)

['switchfoot http twitpic com zl awww bummer shoulda got david carr third day'
 'upset updat facebook text might cri result school today also blah'
 'kenichan dive mani time ball manag save rest go bound' ...
 'readi mojo makeov ask detail'
 'happi th birthday boo alll time tupac amaru shakur'
 'happi charitytuesday thenspcc sparkschar speakinguph h']
[0 0 0 ... 1 1 1]


In [16]:
#splitting train and test data
X_train , X_test , Y_train , Y_test = train_test_split(X , Y , test_size = 0.2 , stratify = Y, random_state = 2)

In [17]:
print(X.shape , X_train.shape , X_test.shape)

(1600000,) (1280000,) (320000,)


In [18]:
print(Y.shape , Y_train.shape , Y_test.shape)

(1600000,) (1280000,) (320000,)


In [19]:
#converting the text data to numerical data
vect = TfidfVectorizer()

In [20]:
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

In [21]:
print(X_train)
print(X_test)

  (0, 443068)	0.4484755317023172
  (0, 235046)	0.41996827700291095
  (0, 109306)	0.3753708587402299
  (0, 185194)	0.5277679060576009
  (0, 354545)	0.3588091611460021
  (0, 436715)	0.27259876264838384
  (1, 160636)	1.0
  (2, 288472)	0.16786949597862733
  (2, 132311)	0.2028971570399794
  (2, 150715)	0.18803850583207948
  (2, 178062)	0.1619010109445149
  (2, 409145)	0.15169282335109835
  (2, 266730)	0.24123230668976975
  (2, 443432)	0.3348599670252845
  (2, 77929)	0.31284080750346344
  (2, 433562)	0.3296595898028565
  (2, 406401)	0.32105459490875526
  (2, 129411)	0.29074192727957143
  (2, 407303)	0.18709338684973031
  (2, 124484)	0.1892155960801415
  (2, 109306)	0.4591176413728317
  (3, 172422)	0.37464146922154384
  (3, 411530)	0.27089772444087873
  (3, 388628)	0.3940776331458846
  (3, 56476)	0.5200465453608686
  :	:
  (1279996, 390132)	0.22064742191076112
  (1279996, 434016)	0.2718945052332447
  (1279996, 318305)	0.21254698865277746
  (1279996, 237900)	0.2236567560099234
  (1279996, 2910

In [22]:
#training the logistic regression model
model = LogisticRegression(max_iter = 1000)

In [23]:
model.fit(X_train , Y_train)

In [24]:
#model evaluation (accuracy score)
#training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train , X_train_prediction)

In [25]:
print(training_data_accuracy)

0.79871875


In [26]:
#test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test , X_test_prediction)

In [27]:
print(test_data_accuracy)

0.776665625


In [33]:
#saving the trained model
import pickle
MODEL = 'trained_model.sav'
pickle.dump(model , open(MODEL , 'wb'))

In [None]:
vec_file = ('Vect.pickle')
pickle.dump(vect, open(vec_file , 'wb'))

In [29]:
#using the saved model
loaded_model = pickle.load(open('trained_model.sav' , 'rb'))

In [30]:
X_new = X_test[200]
prediction = loaded_model.predict(X_new)

if(prediction == 0):
    print('Negative tweet')
else:
    print('Positive tweet')

Positive tweet


In [31]:
X_test[200]

<1x461490 sparse matrix of type '<class 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [32]:
INP = input()
X_new = vect.transform([INP])
 
prediction = loaded_model.predict(X_new)

if(prediction == 0):
    print('Negative tweet')
else:
    print('Positive tweet')

Positive tweet
