In [None]:
#Opening the sentiment analyzed reddit dataset
import pandas as pd
import numpy as np
from google.colab import files

import io

# Name the CSV is expected to have; only used as a fallback when nothing is uploaded.
DATA_FILENAME = "sentiment_data1.csv"

# files.upload() opens the browser file picker and returns {saved_name: file_bytes}.
uploaded = files.upload()

# Read the bytes that were just uploaded instead of a hard-coded path: when a file
# with the same name already exists in the runtime, Colab stores the new upload
# under a different name (e.g. "sentiment_data1 (1).csv"), so opening the fixed
# name would silently load the older copy.
uploaded_name = next(iter(uploaded), None)
if uploaded_name is None:
    # Nothing was uploaded (dialog cancelled): fall back to a copy already on disk.
    df = pd.read_csv(DATA_FILENAME)
else:
    df = pd.read_csv(io.BytesIO(uploaded[uploaded_name]))
df.head()

Saving sentiment_data1.csv to sentiment_data1.csv


Unnamed: 0,id,Post,Sentiment,Score
0,0,What age group are you in? I don't know how to...,Negative,0
1,1,Changes to r/RandomThoughts are coming Hey fri...,Neutral,1
2,2,Orgasms seem kind of amazing. Not in the way t...,Positive,2
3,3,Isn't following an alpha male the literal defi...,Negative,0
4,4,Butt is weird Each of us has these two spheres...,Neutral,1


## Data Cleaning

In [None]:
# Dimensions of the loaded frame as (number of rows, number of columns).
df.shape

(2299, 4)

In [None]:
#checking duplications
# Count repeated posts by their text only.
# NOTE(review): in the preview `id` looks like a per-row identifier; if it is
# unique, comparing whole rows can never report a duplicate even when the same
# post text appears twice, so the check is restricted to the "Post" column.
df.duplicated(subset=["Post"]).sum()

0

In [None]:
# Count the missing values in each column (a result of 0 means the column is complete).
df.isna().sum()

Unnamed: 0,0
id,0
Post,0
Sentiment,0
Score,0


In [None]:
#importing necessary libraries
import re
import string

# Convert uppercase to lowercase and collapse every run of whitespace into one space
# (split on whitespace, then re-join), so the later steps see single-spaced text.
df["Post"] = df["Post"].str.lower().str.split().str.join(" ")

# Remove links: drop everything from "http://" or "https://" to the end of that word.
# Raw strings keep the regex free of invalid escape sequences such as "\/" and "\d".
df["Post"] = df["Post"].str.replace(r"https?://\S*", "", regex=True)

# Remove user mentions of the form @name.
df["Post"] = df["Post"].str.replace(r"@[A-Za-z0-9]+", "", regex=True)

# Remove numbers.
df["Post"] = df["Post"].str.replace(r"\d+", "", regex=True)

In [None]:
# Preview the first ten posts after lower-casing and link/mention/number removal.
df.head(10)

Unnamed: 0,id,Post,Sentiment,Score
0,0,what age group are you in? i don't know how to...,Negative,0
1,1,changes to r/randomthoughts are coming hey fri...,Neutral,1
2,2,orgasms seem kind of amazing. not in the way t...,Positive,2
3,3,isn't following an alpha male the literal defi...,Negative,0
4,4,butt is weird each of us has these two spheres...,Neutral,1
5,5,the worlds obsessions with big butts kinda irk...,Negative,0
6,6,what do people get wrong about introverts? tha...,Negative,0
7,7,does anyone else get bored really easily? i ca...,Negative,0
8,8,every time you poop someone else in the world ...,Negative,0
9,9,"why are people told to ""be yourself"", yet soci...",Negative,0


In [None]:
#remove punctuations

# Deletion table built once: every character of string.punctuation maps to "removed".
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def remove_punctuations(text):
  """Return ``text`` with every ASCII punctuation character removed.

  Only the characters listed in ``string.punctuation`` are deleted; any other
  character (letters, whitespace, non-ASCII symbols) is kept unchanged.

  Args:
    text: The string to clean.

  Returns:
    A new string without the punctuation characters.
  """
  # A single translate pass replaces one str.replace call per punctuation character.
  return text.translate(_PUNCTUATION_TABLE)

# Run the punctuation stripper over every post, element by element.
df["Post"] = df["Post"].map(remove_punctuations)

# Preview the posts without punctuation.
df.head()

Unnamed: 0,id,Post,Sentiment,Score
0,0,what age group are you in i dont know how to p...,Negative,0
1,1,changes to rrandomthoughts are coming hey frie...,Neutral,1
2,2,orgasms seem kind of amazing not in the way th...,Positive,2
3,3,isnt following an alpha male the literal defin...,Negative,0
4,4,butt is weird each of us has these two spheres...,Neutral,1


In [None]:
pip install nltk



In [None]:
#remove stopwords
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
# Keep the stop words in a set: the filter below tests membership once per word of
# every post, which is a constant-time lookup on a set but a full scan on a list.
sw = set(stopwords.words('english'))

# NOTE(review): punctuation was stripped in an earlier cell, so contractions now
# read "dont" / "isnt"; the next preview shows them surviving this filter,
# presumably because the stop-word list spells them with an apostrophe. Confirm
# whether keeping them is intended.
df['Post'] = df['Post'].apply(lambda post: " ".join(word for word in post.split() if word not in sw))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Preview the posts after stop-word removal.
df.head()

Unnamed: 0,id,Post,Sentiment,Score
0,0,age group dont know phrase without sounding cr...,Negative,0
1,1,changes rrandomthoughts coming hey friends inf...,Neutral,1
2,2,orgasms seem kind amazing way feel sense body ...,Positive,2
3,3,isnt following alpha male literal definition beta,Negative,0
4,4,butt weird us two spheres behind walk carrying...,Neutral,1


In [None]:
#stemming
from nltk.stem import PorterStemmer
ps = PorterStemmer()


def _stem_post(post):
  """Stem each whitespace-separated word of ``post`` and re-join them with single spaces."""
  return " ".join(map(ps.stem, post.split()))


# Replace every post by its stemmed version, then preview the result.
df["Post"] = df["Post"].map(_stem_post)

df.head()

Unnamed: 0,id,Post,Sentiment,Score
0,0,age group dont know phrase without sound creep...,Negative,0
1,1,chang rrandomthought come hey friend inform pr...,Neutral,1
2,2,orgasm seem kind amaz way feel sens bodi make ...,Positive,2
3,3,isnt follow alpha male liter definit beta,Negative,0
4,4,butt weird us two sphere behind walk carri around,Neutral,1


In [None]:
#tokenization
import nltk
from nltk.tokenize import word_tokenize

# Fetch the 'punkt_tab' resource before word_tokenize is first called.
nltk.download('punkt_tab')

# Each post changes from one string into a list of token strings.
df['Post'] = df['Post'].map(word_tokenize)

df.head()

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,id,Post,Sentiment,Score
0,0,"[age, group, dont, know, phrase, without, soun...",Negative,0
1,1,"[chang, rrandomthought, come, hey, friend, inf...",Neutral,1
2,2,"[orgasm, seem, kind, amaz, way, feel, sens, bo...",Positive,2
3,3,"[isnt, follow, alpha, male, liter, definit, beta]",Negative,0
4,4,"[butt, weird, us, two, sphere, behind, walk, c...",Neutral,1


In [None]:
#lemmatization
from nltk.stem import WordNetLemmatizer

# Fetch the WordNet data, then build the lemmatizer that reads it.
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
  """Return a new list holding the lemma of every token in ``tokens``."""
  return list(map(lemmatizer.lemmatize, tokens))


# NOTE(review): the tokens were already Porter-stemmed in an earlier cell; the
# previews show this step turning "us" into "u" and "sens" into "sen". Confirm
# that lemmatizing stemmed tokens is intended.
df["Post"] = df["Post"].map(_lemmatize_tokens)
df.head()


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,id,Post,Sentiment,Score
0,0,"[age, group, dont, know, phrase, without, soun...",Negative,0
1,1,"[chang, rrandomthought, come, hey, friend, inf...",Neutral,1
2,2,"[orgasm, seem, kind, amaz, way, feel, sen, bod...",Positive,2
3,3,"[isnt, follow, alpha, male, liter, definit, beta]",Negative,0
4,4,"[butt, weird, u, two, sphere, behind, walk, ca...",Neutral,1


## Model Training

In [None]:
#importing necessary libraries to train a Naive Bayes Model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


# Features: one plain string per post, built by joining its tokens with spaces.
# astype(str) would hand the vectorizer the Python list repr instead (brackets,
# quotes, commas and backslash escapes for non-printable characters), which can
# leak tokens that are not words of the post. Values that are already strings
# are passed through unchanged.
x = df["Post"].apply(lambda tokens: tokens if isinstance(tokens, str) else " ".join(tokens))

# Labels: the sentiment name of each post as a NumPy array.
y = df['Sentiment'].values

In [None]:
# Split the dataset: 30% of the posts are held out for evaluation, and the fixed
# random_state makes the same split come out on every run.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
# TfidfVectorizer is used in the pipeline below but was not imported anywhere
# (only CountVectorizer and TfidfTransformer were), which raises NameError on a
# fresh kernel.
from sklearn.feature_extraction.text import TfidfVectorizer

# Two-step text classifier: TF-IDF features feeding a multinomial Naive Bayes model.
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('naive_bayes', MultinomialNB())])

In [None]:
# Fit both pipeline steps (vectorizer, then Naive Bayes) on the training split only.
pipeline.fit(X_train, y_train)

In [None]:
# Getting the accuracy of the model: predict on both splits first, then score
# each set of predictions against its true labels.
X_train_pred = pipeline.predict(X_train)
X_test_pred = pipeline.predict(X_test)

# `accuracy` is measured on the training split, `accuracy_scr` on the held-out split.
accuracy = accuracy_score(y_true=y_train, y_pred=X_train_pred)
accuracy_scr = accuracy_score(y_true=y_test, y_pred=X_test_pred)


In [None]:
# Results as percentages: first line is the training accuracy, second line the
# accuracy on the held-out test split.
print(accuracy*100, accuracy_scr*100, sep="\n")

85.8297078931013
57.391304347826086


In [None]:
#Saving model
!pip install joblib

import joblib
joblib.dump(pipeline, 'sentiment_model.pkl')



['sentiment_model.pkl']

### Testing Model

In [None]:
# Sample text used to try the fitted pipeline by hand in the next cell.
post="""I rewatched this not that long ago, and even though I know it's pretty standard to watch comedies from this era and see a lot of things
that are like 'wow that didn't age well' (even if it's still an otherwise funny movie), but rewatching this, I was surprised at how the overall
messaging and way it depicts these characters felt a lot more fair and interesting than a lot of other films from that era.
"""

In [None]:
def preprocess_post(text):
  """Clean a raw post with the same steps that were applied to the training posts.

  The pipeline was fitted on lower-cased, de-punctuated, stop-word-filtered,
  stemmed and lemmatized tokens (e.g. "chang", "amaz" in the previews), so raw
  text must go through the identical sequence before it is scored; otherwise
  its unstemmed words cannot match the fitted vocabulary.

  Args:
    text: Raw post text.

  Returns:
    The cleaned tokens joined by single spaces.
  """
  words = text.lower().split()
  # Links and @mentions are stripped word by word, as in the cleaning cell.
  words = [re.sub(r"https?://.*[\r\n]*", "", word, flags=re.MULTILINE) for word in words]
  words = [re.sub(r"@[A-Za-z0-9]+", "", word, flags=re.MULTILINE) for word in words]
  text = re.sub(r"\d+", "", " ".join(words))
  text = remove_punctuations(text)
  text = " ".join(ps.stem(word) for word in text.split() if word not in sw)
  return " ".join(lemmatizer.lemmatize(token) for token in word_tokenize(text))


cleaned_post = preprocess_post(post)
probabilities = pipeline.predict_proba([cleaned_post])
prediction = pipeline.predict([cleaned_post])

# Look each probability up by its class label: the column order of predict_proba
# follows pipeline.classes_, so nothing here depends on a hard-coded position.
class_probabilities = dict(zip(pipeline.classes_, probabilities[0]))
negative = class_probabilities["Negative"]
neutral = class_probabilities["Neutral"]
positive = class_probabilities["Positive"]

# Output the results
print(f"Negative: {negative * 100:.2f}%")
print(f"Neutral: {neutral * 100:.2f}%")
print(f"Positive: {positive * 100:.2f}%")
print(prediction)


Negative: 35.46%
Neutral: 21.45%
Positive: 43.09%
['Positive']
