In [5]:
import pandas as pd

# CSV holding the tweets; it is decoded as ISO-8859-1 (Latin-1) rather than
# pandas' default UTF-8.
path = "training.csv"
df = pd.read_csv(filepath_or_buffer=path, encoding="ISO-8859-1")

# Preview the first rows to sanity-check the parsed columns.
df.head()

Unnamed: 0,sentiment,id of the tweet,date of the tweet,query,user,tweet
0,0,1167810672,Mon Apr 06 22:19:19 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1167810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1167811181,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1167811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1167811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [6]:
import re

import string

from nltk.corpus import stopwords

from nltk.stem import PorterStemmer

#drop rows whose tweet text is missing; the result is rebound to `df`
#instead of mutating with inplace=True, which keeps the step explicit
df = df.dropna(subset=['tweet'])

def removeLinks(text):
    """Return ``text`` (coerced to ``str``) with URLs deleted.

    A URL is anything starting with ``http://``, ``https://`` or ``www.``
    and running up to the next whitespace character. Surrounding spaces are
    left untouched, so removing a link in the middle of a sentence leaves a
    double space behind.
    """
    url_pattern = r'https?://\S+|www\.\S+'
    # str() lets non-string cells (e.g. numbers) pass through without raising.
    return re.sub(url_pattern, '', str(text))

def removePunctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted.

    Characters are removed, not replaced by spaces, so ``"can't"`` becomes
    ``"cant"``.
    """
    # A translation table whose third argument lists the characters to delete.
    deletion_table = str.maketrans('', '', string.punctuation)
    return text.translate(deletion_table)

def removeStoppingWords(text, stop_words=None):
    """Drop stop words from a whitespace-separated string.

    Parameters
    ----------
    text : str
        Input text. It is split on whitespace and each token is compared
        as-is (case-sensitively, punctuation included) against the stop list.
    stop_words : iterable of str, optional
        Words to remove. When omitted, NLTK's English stop-word list is used.
        That list is loaded once and cached on the function as a frozenset,
        so applying this function row by row does not reload the corpus or
        scan a list for every token.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    if stop_words is None:
        stop_words = getattr(removeStoppingWords, "_english_stop_words", None)
        if stop_words is None:
            # First call: load the corpus once and keep it for later calls.
            stop_words = frozenset(stopwords.words("english"))
            removeStoppingWords._english_stop_words = stop_words
    elif not isinstance(stop_words, (set, frozenset)):
        # Normalise caller-supplied lists/tuples to get O(1) membership tests.
        stop_words = frozenset(stop_words)
    return ' '.join(word for word in text.split() if word not in stop_words)

def convertToStem(text):
    """Return ``text`` reduced to space-separated Porter stems.

    Every character that is not an ASCII letter is first turned into a
    space, so digits, punctuation and non-ASCII characters act as word
    separators. Each remaining word is then stemmed.
    """
    stemmer = PorterStemmer()
    # Keep only a-z / A-Z; everything else becomes a separator.
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    stems = map(stemmer.stem, letters_only.split())
    return ' '.join(stems)

In [7]:
# Build the cleaned text column from the raw tweets in a single chain:
# strip links -> lowercase -> drop stop words -> strip punctuation -> stem.
# NOTE(review): stop words are filtered before punctuation is stripped, so a
# token such as "it," does not match the stop list and survives as "it".
# Confirm this ordering is intended before changing it, since any change
# alters the features the saved model is trained on.
df['stemmed_data'] = (
    df['tweet']
    .apply(removeLinks)
    .str.lower()
    .apply(removeStoppingWords)
    .apply(removePunctuation)
    .apply(convertToStem)
)


df['stemmed_data'].head(20)

0     upset cant updat facebook text it might cri re...
1     kenichan dive mani time ball manag save rest g...
2                       whole bodi feel itchi like fire
3     nationwideclass no behav all im mad here cant ...
4                                   kwesidei whole crew
5                                              need hug
6     loltrish hey long time see ye rain bit onli bi...
7                                           repierc ear
8       caregiv bear watch it thought ua loss embarrass
9           octolinz count idk either never talk anymor
10    smarrison wouldv first gun realli though zac s...
11    iamjazzyfizzl wish got watch you miss iamlilni...
12    holli death scene hurt sever watch film wri di...
13                                             file tax
14    lettya ahh ive alway want see rent love soundt...
15    fakerpattypattz oh dear drink forgotten tabl d...
16                          alydesign day get much done
17    one friend call me ask meet mid valley tod

In [8]:
#Imports for splitting the data (training and test), vectorizing, modelling and scoring
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [9]:
# Features are the cleaned tweet text; targets are the sentiment labels.
X, Y = df['stemmed_data'].values, df['sentiment'].values


In [10]:
# Hold out 20% of the rows for testing. The split is stratified on the labels
# and uses a fixed seed so it is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [11]:
#vectorizing data
vectorizer = TfidfVectorizer()

# The vectorizer is fitted on the training split only and then reused,
# unchanged, on the test split. The tuple is evaluated left to right, so the
# fit happens before the test transform.
# NOTE: X_train / X_test are rebound from raw text to TF-IDF matrices here, so
# this cell cannot be re-run without re-running the split first.
X_train, X_test = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)

In [12]:
#training model
# Logistic-regression classifier allowed up to 1000 solver iterations.
model = LogisticRegression(max_iter=1_000)

In [13]:
# Fit on the TF-IDF training matrix and its labels.
model.fit(X=X_train, y=Y_train)

In [14]:
#accuracy score for training data
# Score the model on the rows it was fitted on.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(
    y_true=Y_train,
    y_pred=X_train_prediction,
)
training_data_accuracy

0.8595374420193191

In [15]:
#accuracy score for test data
# Score the model on the held-out rows it never saw during fitting.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(
    y_true=Y_test,
    y_pred=X_test_prediction,
)
test_data_accuracy

0.834627947452495

In [16]:
#saving model
import pickle

# Persist the fitted classifier and the fitted vectorizer; both are needed to
# score new text. The `with` blocks guarantee each file is flushed and closed
# even if pickling raises, rather than leaving the handle to the garbage
# collector.
with open("trained_model.sav", 'wb') as model_file:
    pickle.dump(model, model_file)
with open("vectorizer.sav", 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)
