In [3]:
import pandas as pd
import numpy as np

In [4]:
import chardet

# Read a portion of the file to detect encoding
with open('test_sentiment_dataset.csv', 'rb') as f:
    result = chardet.detect(f.read(100000))  # Read the first 100,000 bytes
    print(result)


{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}


In [5]:
texts=pd.read_csv('test_sentiment_dataset.csv',encoding='ISO-8859-1')

In [6]:
texts

Unnamed: 0,textID,text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,morning,0-20,Afghanistan,38928346.0,652860.0,60.0
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,noon,21-30,Albania,2877797.0,27400.0,105.0
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,night,31-45,Algeria,43851044.0,2381740.0,18.0
3,01082688c6,happy bday!,positive,morning,46-60,Andorra,77265.0,470.0,164.0
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,noon,60-70,Angola,32866272.0,1246700.0,26.0
...,...,...,...,...,...,...,...,...,...
4810,,,,,,,,,
4811,,,,,,,,,
4812,,,,,,,,,
4813,,,,,,,,,


In [7]:
texts = texts.drop(columns=['textID','Time of Tweet','Age of User','Country','Population -2020','Land Area (Km²)','Density (P/Km²)'])

In [8]:
texts

Unnamed: 0,text,sentiment
0,Last session of the day http://twitpic.com/67ezh,neutral
1,Shanghai is also really exciting (precisely -...,positive
2,"Recession hit Veronique Branquinho, she has to...",negative
3,happy bday!,positive
4,http://twitpic.com/4w75p - I like it!!,positive
...,...,...
4810,,
4811,,
4812,,
4813,,


In [13]:
texts.shape

(4815, 2)

In [14]:
texts.sentiment.value_counts()

sentiment
neutral     1430
positive    1103
negative    1001
Name: count, dtype: int64

In [9]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [161]:
lemmatizer=WordNetLemmatizer()

In [162]:
corpus=[]

In [163]:
for i in range(len(texts)):
    review = texts['text'][i]
    if isinstance(review, str):  # Ensure the text is a string
        review = re.sub('[^a-zA-Z]', ' ', review, flags=re.MULTILINE)  # Use correct regular expression substitution
        review = re.sub('http\S+|www\S+|https\S+', '<URL>', review)  # Use correct regular expression substitution for URLs
        review = review.lower()
        review = review.split()
        review = [lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')]
        review = ' '.join(review)
        corpus.append(review)
    else:
        corpus.append('')  # Append empty string for non-string entries

In [164]:
corpus

['last session day http twitpic com ezh',
 'shanghai also really exciting precisely skyscraper galore good tweeps china sh bj',
 'recession hit veronique branquinho quit company shame',
 'happy bday',
 'http twitpic com w p like',
 'great weee visitor',
 'think everyone hate lol',
 'soooooo wish could im school myspace completely blocked',
 'within short time last clue',
 'get day alright done anything yet leaving soon stepsister though',
 'bike put hold known argh total bummer',
 'checked win',
 'twitter tavern bore much',
 'va weekend youngest son turn tomorrow make kinda sad getting big check twipics',
 'coming socket feel like phone hole virgin loose',
 'hot today like hate new timetable bad week',
 'miss',
 'cramp',
 'guy say hi answer question yesterday nice song',
 'going spiritual stagnentation exploding ego realise great ok',
 'stupid storm river u tonight',
 'dead grandpa pay attention',
 'need retail therapy bad ahhh gimme money geebus',
 'go sleep',
 'lame go make breakfast

In [165]:
#Create TF-IDF VECTORIZER


In [166]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [167]:
vectorized_counter = TfidfVectorizer()

In [168]:
x = vectorized_counter.fit_transform(corpus).toarray()

In [169]:
x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [170]:
x.shape

(4815, 6359)

In [171]:
y= pd.get_dummies(texts['sentiment'])

In [172]:
y

Unnamed: 0,negative,neutral,positive
0,False,True,False
1,False,False,True
2,True,False,False
3,False,False,True
4,False,False,True
...,...,...,...
4810,False,False,False
4811,False,False,False
4812,False,False,False
4813,False,False,False


In [173]:
from sklearn.model_selection import train_test_split

In [174]:
x.shape

(4815, 6359)

In [175]:
y.shape

(4815, 3)

In [176]:
y.isnull().sum()

negative    0
neutral     0
positive    0
dtype: int64

In [177]:
y=y.iloc[:,1].values

In [178]:
y

array([ True, False, False, ..., False, False, False])

In [179]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=0)

In [180]:
x_train.shape

(3852, 6359)

In [181]:
y_train.shape

(3852,)

In [182]:
x_test.shape

(963, 6359)

In [183]:
y_test.shape

(963,)

In [184]:
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix

In [185]:
model=LogisticRegression()

In [186]:
model.fit(x_train,y_train)

In [187]:
y_pred = model.predict(x_test)

In [188]:
y_pred

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False,  True, False, False,
       False, False,  True, False, False, False, False, False, False,
       False, False, False, False,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [189]:
matrix=confusion_matrix(y_test,y_pred)

In [190]:
matrix

array([[655,  20],
       [259,  29]], dtype=int64)

In [191]:
accuracy= accuracy_score(y_test,y_pred)

In [192]:
accuracy

0.7102803738317757

In [193]:
from sklearn.naive_bayes import MultinomialNB

In [194]:
NB = MultinomialNB().fit(x_train,y_train)

In [195]:
y_pred = NB.predict(x_test)

In [196]:
y_pred

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [197]:
accuracy = accuracy_score(y_test,y_pred)

In [199]:
accuracy

0.7040498442367601