In [2]:
# Stage the Kaggle API token: create ~/.kaggle if it does not exist and copy
# kaggle.json from the working directory into it.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Mode 600 = read/write for the owner only, so the token is not readable by other users.
!chmod 600 ~/.kaggle/kaggle.json

In [3]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords #natural language toolkit
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer #converts textual data to numerical
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [4]:
import nltk
# Fetch the NLTK stopword corpus. As the recorded output shows, a failed
# download (here an SSL certificate error) is reported by printing the error
# and returning False rather than raising, so the notebook keeps running.
# NOTE(review): the next cell still lists stopwords, so a local copy of the
# corpus was presumably already installed - confirm before relying on this offline.
nltk.download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:992)>


False

In [5]:
# Inspect the English stopword list that NLTK provides.
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [6]:
# First attempt at loading the CSV. No `names`/`header` argument is given, so
# pandas takes the first line of the file as the column labels: the recorded
# shape is (1599999, 6) and the first tweet ends up in the header row.
twitter_data = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding = 'ISO-8859-1')

In [7]:
# (rows, columns) of the frame as currently loaded.
twitter_data.shape

(1599999, 6)

In [8]:
# Preview the first three rows; note the column labels in the recorded output are actually a data record.
twitter_data.head(3)

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire


In [9]:
# Labels for the six CSV fields, in file order.
column_names = ['target', 'id', 'date', 'flag', 'user', 'text']
# Re-read the same file with explicit labels. Passing `names` makes pandas
# treat every line, including the first one, as data instead of a header.
twitter_data = pd.read_csv('training.1600000.processed.noemoticon.csv', names= column_names, encoding = 'ISO-8859-1')

In [10]:
# Preview the first three rows, now under the explicit column labels.
twitter_data.head(3)

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...


In [11]:
# Count missing values per column.
twitter_data.isnull().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [12]:
# How many rows carry each distinct value of the `target` label.
twitter_data['target'].value_counts()

target
0    800000
4    800000
Name: count, dtype: int64

In [13]:
# Recode label 4 as 1 so the target is binary: 0 = negative, 1 = positive.
# `inplace=True` mutates `twitter_data` directly; running the cell again is
# harmless because no 4s remain after the first run.
twitter_data.replace({'target':{4:1}}, inplace = True)
# Confirm the class counts after recoding.
twitter_data['target'].value_counts()

target
0    800000
1    800000
Name: count, dtype: int64

In [14]:
# One shared Porter stemmer instance; stemming() reads this global.
port_stem = PorterStemmer()

In [15]:
# Built once when this cell runs. The original looked the list up again with
# stopwords.words('english') for every single token, and then scanned it
# linearly; a frozenset gives constant-time membership tests instead.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

# Matches any character that is not an ASCII letter.
_NON_LETTERS = re.compile('[^a-zA-Z]')


def stemming(content):
  """Normalise one tweet into a space-separated string of stemmed tokens.

  Steps: replace every non-letter character with a space, lower-case the
  text, split on whitespace, drop English stopwords, Porter-stem the
  remaining words and join them back with single spaces.

  Args:
    content: The raw tweet text (a string).

  Returns:
    The cleaned, stemmed text as a single string. It is empty when the input
    contains no letters or only stopwords.
  """
  # Digits, punctuation, '@', URL separators etc. all become word boundaries.
  tokens = _NON_LETTERS.sub(' ', content).lower().split()
  # Stopwords are filtered before stemming, so membership is tested on the
  # lower-cased surface form, exactly as in the original implementation.
  return ' '.join(port_stem.stem(word) for word in tokens if word not in _ENGLISH_STOPWORDS)

In [16]:
# Run stemming() on every tweet's text and store the result in a new column.
# Series.apply calls the Python function once per row.
twitter_data['stemmed_content']= twitter_data['text'].apply(stemming)

In [17]:
# Preview the first three rows, including the new `stemmed_content` column.
twitter_data.head(3)

Unnamed: 0,target,id,date,flag,user,text,stemmed_content
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com zl awww bummer sho...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset updat facebook text might cri result sch...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dive mani time ball manag save rest g...


In [18]:
#Separate the data from the labels.
# `.values` pulls out the underlying NumPy arrays: X holds the preprocessed
# tweet strings and Y the 0/1 targets.
X = twitter_data['stemmed_content'].values
Y = twitter_data['target'].values

In [19]:
#Split to Training data and Test data
# 80/20 split. stratify=Y keeps the class proportions the same in both parts,
# and the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y, random_state = 2)

In [20]:
# Sanity-check the sizes of the full, training and test arrays (features, then labels).
print(X.shape, X_train.shape, X_test.shape)
print(Y.shape, Y_train.shape, Y_test.shape)

(1600000,) (1280000,) (320000,)
(1600000,) (1280000,) (320000,)


In [21]:
#converting the textual data to numerical data
vectorizer =  TfidfVectorizer()
# Learn the vocabulary and IDF weights from the training tweets only ...
X_train = vectorizer.fit_transform(X_train)
# ... and reuse that fitted vocabulary for the test tweets, so the features
# are built without looking at the test set.
X_test = vectorizer.transform(X_test)
# NOTE(review): X_train / X_test are rebound here from raw strings to TF-IDF
# matrices, so this cell should not be run twice without re-running the
# split cell first.

In [22]:
# Show the TF-IDF training matrix; entries print as "(row, column)  weight".
print(X_train)

  (0, 436713)	0.27259876264838384
  (0, 354543)	0.3588091611460021
  (0, 185193)	0.5277679060576009
  (0, 109306)	0.3753708587402299
  (0, 235045)	0.41996827700291095
  (0, 443066)	0.4484755317023172
  (1, 160636)	1.0
  (2, 109306)	0.4591176413728317
  (2, 124484)	0.1892155960801415
  (2, 407301)	0.18709338684973031
  (2, 129411)	0.29074192727957143
  (2, 406399)	0.32105459490875526
  (2, 433560)	0.3296595898028565
  (2, 77929)	0.31284080750346344
  (2, 443430)	0.3348599670252845
  (2, 266729)	0.24123230668976975
  (2, 409143)	0.15169282335109835
  (2, 178061)	0.1619010109445149
  (2, 150715)	0.18803850583207948
  (2, 132311)	0.2028971570399794
  (2, 288470)	0.16786949597862733
  (3, 406399)	0.29029991238662284
  (3, 158711)	0.4456939372299574
  (3, 151770)	0.278559647704793
  (3, 56476)	0.5200465453608686
  :	:
  (1279996, 318303)	0.21254698865277744
  (1279996, 434014)	0.27189450523324465
  (1279996, 390130)	0.2206474219107611
  (1279996, 373144)	0.35212500999832036
  (1279996, 23807

In [23]:
# Show the TF-IDF test matrix; entries print as "(row, column)  weight".
print(X_test)

  (0, 15110)	0.1719352837797837
  (0, 31168)	0.16247724180521766
  (0, 67828)	0.26800375270827315
  (0, 106069)	0.3655545001090455
  (0, 132364)	0.25525488955578596
  (0, 138164)	0.23688292264071403
  (0, 171378)	0.2805816206356073
  (0, 271016)	0.4535662391658828
  (0, 279082)	0.1782518010910344
  (0, 388348)	0.21985076072061738
  (0, 398906)	0.3491043873264267
  (0, 409143)	0.31430470598079707
  (0, 420984)	0.17915624523539803
  (1, 6463)	0.30733520460524466
  (1, 15110)	0.211037449588008
  (1, 145393)	0.575262969264869
  (1, 217562)	0.40288153995289894
  (1, 256777)	0.28751585696559306
  (1, 348135)	0.4739279595416274
  (1, 366203)	0.24595562404108307
  (2, 22532)	0.3532582957477176
  (2, 34401)	0.37916255084357414
  (2, 89448)	0.36340369428387626
  (2, 183312)	0.5892069252021465
  (2, 256834)	0.2564939661498776
  :	:
  (319994, 443794)	0.2782185641032538
  (319995, 107868)	0.3339934973754696
  (319995, 109379)	0.30208964848908326
  (319995, 155493)	0.2770682832971668
  (319995, 213

In [24]:
from sklearn.linear_model import LogisticRegression

#Training the ML model based on LR
# max_iter=1000 raises the solver's iteration cap above scikit-learn's default of 100.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, Y_train)

#Model Evaluation
#Accuracy score on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print('Logistic Regression accuracy score on training data:', training_data_accuracy)

#Accuracy score on the held-out test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print('Logistic Regression accuracy score on test data:', test_data_accuracy)

Logistic Regression accuracy score on training data: 0.79871953125
Logistic Regression accuracy score on test data: 0.77668125


In [35]:
import pickle
filename = 'trained_model.pkl'
# Write through a context manager so the file handle is flushed and closed
# even if pickling fails; a bare open() would leave it dangling until garbage
# collection.
# NOTE(review): only the classifier is saved here. The fitted `vectorizer` is
# not persisted in the cells shown, so raw text cannot be scored from this
# file alone.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [28]:
from sklearn.ensemble import RandomForestClassifier
import time

# Training the Random Forest model
# 100 trees, fixed seed for repeatability, n_jobs=-1 to use all available CPU cores.
rf_model = RandomForestClassifier(n_estimators=100, random_state=2, n_jobs=-1)
# Wall-clock timing (seconds) of the fit.
start_time = time.time()
rf_model.fit(X_train, Y_train)
training_time = time.time() - start_time

# Wall-clock timing (seconds) of predicting the whole test set.
start_time = time.time()
X_test_prediction = rf_model.predict(X_test)
prediction_time = time.time() - start_time

# NOTE(review): X_test_prediction, training_data_accuracy and test_data_accuracy
# reuse the names set by the Logistic Regression cell, so the earlier values
# are overwritten once this cell runs.
training_data_accuracy = accuracy_score(Y_train, rf_model.predict(X_train))
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

print('Random Forest training time:', training_time)
print('Random Forest prediction time:', prediction_time)
print('Random Forest Accuracy score on training data:', training_data_accuracy)
print('Random Forest Accuracy score on test data:', test_data_accuracy)


Random Forest training time: 10274.999679088593
Random Forest prediction time: 38.40966606140137
Random Forest Accuracy score on training data: 0.99584453125
Random Forest Accuracy score on test data: 0.774928125


In [30]:
import pickle
filename = 'trained_rf_model.pkl'
# Write through a context manager so the file handle is flushed and closed
# even if pickling fails; a bare open() would leave it dangling until garbage
# collection.
with open(filename, 'wb') as model_file:
    pickle.dump(rf_model, model_file)

In [37]:
#loading the saved model for future predictions. Example loading Logistic Regression model
# NOTE: unpickling can execute arbitrary code, so only load pickle files you
# created yourself. The context manager closes the file once loading is done.
with open('trained_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Take one already-vectorised test sample and print its true label.
X_new = X_test[200]
print(Y_test[200])

# Predict with the model restored from disk. The original called the
# in-memory `model` here, which never exercised the saved file and fails on a
# fresh kernel where only the pickle exists.
prediction = loaded_model.predict(X_new)
print(prediction)

# Label 0 is negative, anything else (i.e. 1) is positive.
if prediction[0] == 0:
    print('negative tweet')
else:
    print('positive tweet')

1
[1]
positive tweet
