In [1]:
! pip install kaggle

Defaulting to user installation because normal site-packages is not writeable


In [2]:
!pip install nltk
!pip install scikit-learn


Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [3]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

## Data Processing

In [5]:
twitter_data = pd.read_csv(r"C:\Users\hp\Downloads\archive\twitter_validation.csv")
twitter_data

Unnamed: 0,3364,Facebook,Irrelevant,"I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣"
0,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
2,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,4433,Google,Neutral,Now the President is slapping Americans in the...
4,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...
...,...,...,...,...
994,4891,GrandTheftAuto(GTA),Irrelevant,⭐️ Toronto is the arts and culture capital of ...
995,4359,CS-GO,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
996,2652,Borderlands,Positive,Today sucked so it’s time to drink wine n play...
997,8069,Microsoft,Positive,Bought a fraction of Microsoft today. Small wins.


In [6]:
# checking the number of rows and columns
twitter_data.shape

(999, 4)

In [7]:
# printing the first 10 rows of the datafrme
twitter_data.head(10)

Unnamed: 0,3364,Facebook,Irrelevant,"I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣"
0,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
2,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,4433,Google,Neutral,Now the President is slapping Americans in the...
4,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...
5,7925,MaddenNFL,Positive,Thank you @EAMaddenNFL!! \n\nNew TE Austin Hoo...
6,11332,TomClancysRainbowSix,Positive,"Rocket League, Sea of Thieves or Rainbow Six: ..."
7,1107,AssassinsCreed,Positive,my ass still knee-deep in Assassins Creed Odys...
8,2069,CallOfDuty,Negative,FIX IT JESUS ! Please FIX IT ! What In the wor...
9,3185,Dota2,Positive,The professional dota 2 scene is fucking explo...


In [8]:
# for changing the columns names
twitter_data.rename(columns={'3364' : 'id', 'Facebook' : 'company', 'Irrelevant' : 'sentiment', 'I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣' : 'text'}, inplace=True)

In [9]:
twitter_data

Unnamed: 0,id,company,sentiment,text
0,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
2,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,4433,Google,Neutral,Now the President is slapping Americans in the...
4,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...
...,...,...,...,...
994,4891,GrandTheftAuto(GTA),Irrelevant,⭐️ Toronto is the arts and culture capital of ...
995,4359,CS-GO,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
996,2652,Borderlands,Positive,Today sucked so it’s time to drink wine n play...
997,8069,Microsoft,Positive,Bought a fraction of Microsoft today. Small wins.


In [10]:
# counting the number of missing values in the dataset
twitter_data.isnull().sum()

id           0
company      0
sentiment    0
text         0
dtype: int64

In [11]:
# checking the distribution of sentiment column
twitter_data['sentiment'].value_counts()

sentiment
Neutral       285
Positive      277
Negative      266
Irrelevant    171
Name: count, dtype: int64

### Stemming

In [12]:
port_stem = PorterStemmer()

In [13]:
def stemming(content):

    stemmed_content = re.sub('[^a-zA-Z]',' ', content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [14]:
twitter_data['stemmed_content'] = twitter_data['text'].apply(stemming)

In [15]:
twitter_data.head()

Unnamed: 0,id,company,sentiment,text,stemmed_content
0,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...,bbc news amazon boss jeff bezo reject claim co...
1,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...,microsoft pay word function poorli samsungu ch...
2,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,...",csgo matchmak full closet hack truli aw game
3,4433,Google,Neutral,Now the President is slapping Americans in the...,presid slap american face realli commit unlaw ...
4,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...,hi eahelp madelein mccann cellar past year lit...


In [16]:
# separating the data and label
X = twitter_data['stemmed_content'].values
Y = twitter_data['sentiment'].values

In [17]:
print(X)

['bbc news amazon boss jeff bezo reject claim compani act like drug dealer bbc co uk news av busin'
 'microsoft pay word function poorli samsungu chromebook'
 'csgo matchmak full closet hack truli aw game'
 'presid slap american face realli commit unlaw act acquitt discov googl vanityfair com news'
 'hi eahelp madelein mccann cellar past year littl sneaki thing escap whilst load fifa point took card use paypal account work help resolv pleas'
 'thank eamaddennfl new te austin hooper orang brown brown austinhoop pic twitter com grg xzfkon'
 'rocket leagu sea thiev rainbow six sieg love play three stream best stream twitch rocketleagu seaofthiev rainbowsixsieg follow'
 'ass still knee deep assassin creed odyssey way anytim soon lmao'
 'fix jesu pleas fix world go playstat askplayst playstationsup treyarch callofduti neg silver wolf error code pic twitter com ziryhrf q'
 'profession dota scene fuck explod complet welcom get garbag'
 'itch assassin tccgif assassinscreedblackflag assassinscr

In [18]:
print(Y)

['Neutral' 'Negative' 'Negative' 'Neutral' 'Negative' 'Positive'
 'Positive' 'Positive' 'Negative' 'Positive' 'Positive' 'Negative'
 'Neutral' 'Negative' 'Positive' 'Positive' 'Negative' 'Positive'
 'Negative' 'Negative' 'Neutral' 'Irrelevant' 'Negative' 'Neutral'
 'Neutral' 'Negative' 'Irrelevant' 'Irrelevant' 'Negative' 'Positive'
 'Positive' 'Negative' 'Positive' 'Negative' 'Neutral' 'Neutral'
 'Irrelevant' 'Positive' 'Neutral' 'Positive' 'Neutral' 'Neutral'
 'Neutral' 'Positive' 'Neutral' 'Negative' 'Negative' 'Negative' 'Neutral'
 'Positive' 'Negative' 'Negative' 'Positive' 'Positive' 'Positive'
 'Positive' 'Positive' 'Negative' 'Irrelevant' 'Negative' 'Positive'
 'Positive' 'Irrelevant' 'Negative' 'Neutral' 'Negative' 'Irrelevant'
 'Neutral' 'Negative' 'Positive' 'Negative' 'Negative' 'Positive'
 'Positive' 'Irrelevant' 'Positive' 'Irrelevant' 'Neutral' 'Neutral'
 'Neutral' 'Positive' 'Positive' 'Neutral' 'Positive' 'Neutral' 'Negative'
 'Irrelevant' 'Negative' 'Neutral' 'Neutral

#### Splitting the dataset

In [19]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.2, random_state=2)

In [20]:
print(X.shape, X_train.shape, X_test.shape)

(999,) (799,) (200,)


In [21]:
print(X_train)

['googl name horror aesthet exist enough use middl name pic twitter com nkkxdsw'
 'let agre complet trash inevit facebook appl googl commerci come show peopl high five hug pride keep us connect'
 'amaz deal lenovo legion th gen intel core inch fhd game laptop gb gb ssd nvidia gtx gb graphic window black kg sy ckin rs amazon shop ekaro enkr http co wg qt fs'
 'cool' 'red dead redempt pic twitter com xblcpjlxl'
 'let stop use good popular song shitti fortnit montag pleas disgrac song'
 'leagu legend taught come term disappoint third novel'
 'nigger peopl johnson johnson'
 'today fun watch interact overwatch leagu long time' 'pro club fuck kit'
 'sick sidemen stan oh mean peopl call bullshit fuck lazi post import petit speak spread awar import movement time post fuck fifa act'
 'done bro stuff got corrupt download game updat ksupport nba k ronni k'
 'good playoverwatch matchmak competit shitti trump dick fuck usa ass'
 'massiv success indieapril happi saw two review amazon today made crap

In [22]:
print(X_test)

['finish assassin creed odyssey shadow offici shadow na begin end thank amaz game experi servic'
 'half polit account interact month three nvidia account two even repli person know irl second row happen pic twitter com nvquiuykoj'
 'u crazi gloat u r fan n respons journo sorri expect u chang drummer never chang u write book u worshipp'
 'lol look someth listen upbeat first recommend heart go nice job googl'
 'seri x fuck huge wtf'
 'congratul ninaman public day hope fabul day book fli'
 'azur microsoft unlock full potenti smart build ecosystem azur microsoft com blog microsoft'
 'flank kill battlefieldv bfv ps share store playstat com tid cusa http co auovyejztc'
 'illiter commun think pubg game killer must ban knive gun limit vehicl speed km h build countri less ft height also caus thousand death per year unbanpubg'
 'pic cute'
 'catch xbox game podcast right intromediagam com home xbox one best xbox podcast xbox xboxseriesx xboxseriess http co cmlrjsgzyk'
 'rue beauti fortnit fortnit

In [23]:
# converting the textual data to numerical data
vectorizer = TfidfVectorizer()

X_train  = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [24]:
print(X_train)

  (0, 2130)	0.3176978688367646
  (0, 567)	0.1060022317788363
  (0, 3299)	0.12529660528535896
  (0, 2353)	0.12943043731232748
  (0, 1968)	0.2760608683499763
  (0, 3367)	0.22613902716262255
  (0, 948)	0.2677760276494109
  (0, 1006)	0.2760608683499763
  (0, 42)	0.29927320260758145
  (0, 1443)	0.29927320260758145
  (0, 2081)	0.5985464052151629
  (0, 1282)	0.20671562143088326
  (1, 606)	0.23798945284043796
  (1, 3364)	0.18035375696294498
  (1, 1676)	0.20676331134352516
  (1, 2441)	0.26598297974467894
  (1, 1460)	0.28235814324658864
  (1, 1108)	0.2543646163423477
  (1, 1406)	0.2453527135568873
  (1, 2327)	0.18035375696294498
  (1, 2850)	0.209995925936197
  (1, 571)	0.18372121683094678
  (1, 576)	0.28235814324658864
  (1, 142)	0.2317639046104568
  (1, 1024)	0.17166624244712334
  :	:
  (797, 2014)	0.212739281364705
  (797, 621)	0.212739281364705
  (797, 3461)	0.19623872178992954
  (797, 842)	0.19623872178992954
  (797, 3229)	0.19623872178992954
  (797, 3069)	0.1707701723641069
  (797, 3560)	0.

In [25]:
print(X_test)

  (0, 3161)	0.2188333621780776
  (0, 2808)	0.29318185436706
  (0, 2195)	0.32177180493208934
  (0, 2190)	0.3010571867126435
  (0, 2079)	0.3364690604548209
  (0, 1200)	0.14275478159507407
  (0, 1098)	0.2803425684931975
  (0, 1009)	0.2803425684931975
  (0, 937)	0.274959854139701
  (0, 660)	0.2509480574477346
  (0, 265)	0.3364690604548209
  (0, 173)	0.2509480574477346
  (0, 91)	0.2577699806248826
  (1, 3300)	0.22939681007824014
  (1, 3299)	0.11284744587715613
  (1, 3186)	0.24863214632970512
  (1, 2783)	0.21636379454934096
  (1, 2697)	0.2861321978931054
  (1, 2404)	0.25776450398567274
  (1, 2353)	0.11657055062421744
  (1, 2335)	0.24863214632970512
  (1, 2168)	0.18276440085887227
  (1, 2030)	0.2348617066820835
  (1, 1714)	0.18115925664597263
  (1, 1586)	0.26953816072532133
  :	:
  (195, 1391)	0.33871355948033494
  (195, 1143)	0.21635027870692866
  (195, 948)	0.2854893921722796
  (195, 791)	0.30513284871661756
  (195, 536)	0.33871355948033494
  (195, 25)	0.24803087437058444
  (196, 3054)	0.36

#### Training the Machine Learning Model

##### Logistic Regression

In [26]:
model = LogisticRegression(max_iter=1000)

In [27]:
model.fit(X_train, Y_train)

##### Accuracy Score

In [28]:
# accuracy score on the training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [30]:
print('accuracy score on the training data:',training_data_accuracy)

accuracy score on the training data: 0.951188986232791


In [31]:
# accuracy score on the test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [32]:
print('accuracy score on the test data:',test_data_accuracy)

accuracy score on the test data: 0.49


##### Saving the trained model

In [33]:
import pickle

In [34]:
filename = 'trained_model.sav'
pickle.dump(model, open(filename,'wb'))