In [1]:
#Opening the sentiment analyzed reddit dataset
import pandas as pd
import numpy as np
from google.colab import files

# Name the dataset is expected to have, both as an upload and on local disk.
DATA_FILE = "sentiment_data.csv"

uploaded = files.upload()

# files.upload() returns {saved_name: file_bytes}. When a file with the same
# name already exists Colab saves the new copy under a different name
# (e.g. "sentiment_data (1).csv"), so reading the hard-coded path could silently
# load a stale copy. Read the bytes that were just uploaded instead.
if uploaded:
    import io

    # Prefer the expected name; otherwise take the first uploaded file.
    uploaded_name = DATA_FILE if DATA_FILE in uploaded else next(iter(uploaded))
    df = pd.read_csv(io.BytesIO(uploaded[uploaded_name]))
else:
    # Nothing was uploaded (dialog cancelled): fall back to the file on disk.
    df = pd.read_csv(DATA_FILE)
df.head()

Saving sentiment_data.csv to sentiment_data.csv


Unnamed: 0,id,Sentiment,Post
0,0,1,Changes to r/RandomThoughts are coming Hey fri...
1,1,1,Pretend it's Monday...I didn't lie 😅 We have p...
2,2,0,"The word ""dick"" sounds small and the word ""coc..."
3,3,1,How many pillows do you have on your bed? Incl...
4,4,0,What's the longest time you've gone without sl...


## Data Cleaning

In [2]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(909, 3)

In [3]:
#checking duplications
# NOTE(review): duplicated() without a subset compares entire rows, so the `id`
# column takes part in the comparison. If ids are unique this count is 0 even
# when the same post text appears more than once — confirm whether
# df.duplicated(subset=["Post"]) is the intended check.
df.duplicated().sum()

0

In [4]:
#checking if there are missing values
# Per-column count of null entries.
df.isnull().sum()

Unnamed: 0,0
id,0
Sentiment,0
Post,0


In [5]:
#importing necessary libraries
import re
import string

# The patterns are raw strings: inside a normal string literal "\/" and "\d" are
# invalid escape sequences (a SyntaxWarning on Python 3.12+). Raw strings pass the
# backslashes to the regex engine unchanged, so matching behaviour is identical.
URL_PATTERN = re.compile(r"https?://.*[\r\n]*", flags=re.MULTILINE)
MENTION_PATTERN = re.compile(r"@[A-Za-z0-9]+", flags=re.MULTILINE)


def _strip_pattern_per_token(pattern, text):
  """Delete every match of ``pattern`` inside each whitespace-separated token of ``text``.

  Tokens are re-joined with single spaces. A token that is matched entirely
  becomes an empty string, so it leaves a doubled space behind.
  """
  return " ".join(pattern.sub("", token) for token in text.split())


#convert uppercase to lowercase (also collapses runs of whitespace to single spaces)
df['Post'] = df['Post'].apply(lambda post: " ".join(token.lower() for token in post.split()))

#remove links
df['Post'] = df['Post'].apply(lambda post: _strip_pattern_per_token(URL_PATTERN, post))

#remove user mentions
df['Post'] = df['Post'].apply(lambda post: _strip_pattern_per_token(MENTION_PATTERN, post))

#remove numbers
df["Post"] = df["Post"].str.replace(r'\d+', '', regex = True)

In [6]:
# Preview the first 10 rows of the frame at this stage of cleaning.
df.head(10)

Unnamed: 0,id,Sentiment,Post
0,0,1,changes to r/randomthoughts are coming hey fri...
1,1,1,pretend it's monday...i didn't lie 😅 we have p...
2,2,0,"the word ""dick"" sounds small and the word ""coc..."
3,3,1,how many pillows do you have on your bed? incl...
4,4,0,what's the longest time you've gone without sl...
5,5,0,"i miss the name ""dick"". it's a great boys' nam..."
6,6,0,saying “cock” instead of “dick” is so cringe /...
7,7,1,do you pee in the shower while showering?
8,8,0,"nobody will love you, unless you are usable."
9,9,0,i have the most sensitive nipples in the world...


In [7]:
#remove punctuations

def remove_punctuations(text):
  """Return ``text`` with every character listed in ``string.punctuation`` deleted.

  Only the ASCII punctuation in ``string.punctuation`` is removed; other
  characters (letters, digits, whitespace, emoji, typographic quotes) are kept.
  """
  # The third argument of str.maketrans lists characters to delete, so a single
  # translate() pass removes all of them.
  deletion_table = str.maketrans('', '', string.punctuation)
  return text.translate(deletion_table)

# Strip punctuation from every post, overwriting the "Post" column.
df["Post"] = df["Post"].apply(remove_punctuations)

# Preview the result.
df.head()

Unnamed: 0,id,Sentiment,Post
0,0,1,changes to rrandomthoughts are coming hey frie...
1,1,1,pretend its mondayi didnt lie 😅 we have plans ...
2,2,0,the word dick sounds small and the word cock s...
3,3,1,how many pillows do you have on your bed inclu...
4,4,0,whats the longest time youve gone without slee...


In [8]:
pip install nltk



In [9]:
#remove stopwords
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
sw = stopwords.words('english')
# `sw` is a list, so `token not in sw` scans it for every token. A frozenset
# makes each lookup constant-time without changing which words are dropped.
sw_lookup = frozenset(sw)
# NOTE(review): punctuation was stripped in an earlier cell, so contractions now
# appear without apostrophes (e.g. "didnt") and presumably no longer match the
# apostrophe-bearing stop words; they survive this filter. Confirm whether stop
# words should be removed before punctuation instead.
df['Post'] = df['Post'].apply(lambda post: " ".join(token for token in post.split() if token not in sw_lookup))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [10]:
# Preview the posts after stop-word removal.
df.head()

Unnamed: 0,id,Sentiment,Post
0,0,1,changes rrandomthoughts coming hey friends inf...
1,1,1,pretend mondayi didnt lie 😅 plans place begin ...
2,2,0,word dick sounds small word cock sounds huge
3,3,1,many pillows bed including throw pillows
4,4,0,whats longest time youve gone without sleep th...


In [11]:
#stemming
from nltk.stem import PorterStemmer
ps = PorterStemmer()


def _stem_post(text):
  """Porter-stem each whitespace-separated word of ``text`` and re-join with single spaces."""
  stemmed_words = [ps.stem(word) for word in text.split()]
  return " ".join(stemmed_words)


df["Post"] = df["Post"].apply(_stem_post)

df.head()

Unnamed: 0,id,Sentiment,Post
0,0,1,chang rrandomthought come hey friend inform pr...
1,1,1,pretend mondayi didnt lie 😅 plan place begin u...
2,2,0,word dick sound small word cock sound huge
3,3,1,mani pillow bed includ throw pillow
4,4,0,what longest time youv gone without sleep thin...


In [12]:
#tokenization
import nltk
from nltk.tokenize import word_tokenize

# Fetch the "punkt_tab" resource before tokenizing.
nltk.download('punkt_tab')
# Each post string becomes a list of tokens; the function is passed directly
# instead of being wrapped in a lambda.
df['Post'] = df['Post'].apply(word_tokenize)

df.head()

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Unnamed: 0,id,Sentiment,Post
0,0,1,"[chang, rrandomthought, come, hey, friend, inf..."
1,1,1,"[pretend, mondayi, didnt, lie, 😅, plan, place,..."
2,2,0,"[word, dick, sound, small, word, cock, sound, ..."
3,3,1,"[mani, pillow, bed, includ, throw, pillow]"
4,4,0,"[what, longest, time, youv, gone, without, sle..."


In [13]:
#lemmatization
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
nltk.download('wordnet')
# Each post is a list of tokens here; lemmatize token by token and keep the list shape.
# NOTE(review): the tokens were already Porter-stemmed in an earlier cell, and the
# preview printed below is identical to the tokenization preview, so this step may
# have no effect on stemmed text — confirm whether both stemming and
# lemmatization are wanted.
df["Post"] = df["Post"].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
df.head()


[nltk_data] Downloading package wordnet to /root/nltk_data...


Unnamed: 0,id,Sentiment,Post
0,0,1,"[chang, rrandomthought, come, hey, friend, inf..."
1,1,1,"[pretend, mondayi, didnt, lie, 😅, plan, place,..."
2,2,0,"[word, dick, sound, small, word, cock, sound, ..."
3,3,1,"[mani, pillow, bed, includ, throw, pillow]"
4,4,0,"[what, longest, time, youv, gone, without, sle..."


## Model Training

In [14]:
#importing necessary libraries to train a Naive Bayes Model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer


# Each post is a list of tokens at this point. astype(str) would hand the
# vectorizer the list's repr ("['chang', 'come', ...]"), in which non-printable
# characters are written as escape sequences and then tokenized as junk words.
# Joining the tokens with spaces gives the vectorizer plain text instead.
# Values that are already strings are passed through unchanged.
x = df["Post"].map(lambda tokens: tokens if isinstance(tokens, str) else " ".join(tokens))
y = df['Sentiment'].values

In [15]:
#splitting the dataset
# Hold out 30% of the posts for evaluation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(x, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [16]:
#Converting Post data into numerical data
vector = TfidfVectorizer()
# Fit on the training split only, then reuse the fitted vectorizer to transform
# the test split.
# NOTE(review): X_train / X_test are rebound from text to TF-IDF matrices, so
# re-running this cell without first re-running the split cell feeds the matrices
# back into the vectorizer.
X_train = vector.fit_transform(X_train)
X_test = vector.transform(X_test)

In [17]:
#Creating the multinomial Naive Bayes model
# Constructed with default hyperparameters and trained on the TF-IDF features of
# the training split.
model = MultinomialNB()
model.fit(X_train, y_train)


In [18]:
#Getting the accuracy of the model
# Training-set accuracy, stored in `accuracy`.
X_train_pred = model.predict(X_train)
accuracy = accuracy_score(y_train, X_train_pred)

# Held-out test-set accuracy, stored in `accuracy_scr`.
X_test_pred = model.predict(X_test)
accuracy_scr = accuracy_score(y_test, X_test_pred)


In [19]:
#Results
# First line: training accuracy (%); second line: test accuracy (%).
# NOTE(review): the recorded run shows roughly 85% on train versus 56% on test;
# a gap this large suggests overfitting and is worth investigating before relying
# on the saved model.
print(accuracy*100)
print(accuracy_scr*100)

84.90566037735849
55.67765567765568


In [20]:
#Saving model
!pip install joblib

import joblib
joblib.dump(model, 'sentiment_model.pkl')



['sentiment_model.pkl']