In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,accuracy_score,precision_score,recall_score,f1_score,confusion_matrix

In [2]:
data_df = pd.read_csv("sentiments.csv");

In [3]:
data_df.sample(5)

Unnamed: 0,Sentiment,SentimentText,Unnamed: 2
32369,1,"@agchick 3:10 Yuma, ... fat guy in a little co...",
15409,0,*shudder* having to use IE to play on an onlin...,
11208,1,"#myweakness pot, beer &amp; women",
18315,1,@ the breakfast spot with dad and jon!! Good t...,
35200,1,@alyandaj woow ALy I'm so happy for you! you d...,


In [4]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39894 entries, 0 to 39893
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Sentiment      39894 non-null  int64 
 1   SentimentText  39894 non-null  object
 2   Unnamed: 2     11 non-null     object
dtypes: int64(1), object(2)
memory usage: 935.1+ KB


In [5]:
data_df.tail()

Unnamed: 0,Sentiment,SentimentText,Unnamed: 2
39889,0,@AmandaAzzarello see you soon i will cheer yo...,
39890,1,@amandabaybee09 chasers 9pm dont forget!! plu...,
39891,0,@amandabaybee09 i cant cause i have no way of ...,
39892,1,@amandabaybee09 okalie lots of help tomoro lo...,
39893,1,"@amandabcdefgh lol, same h",


In [6]:
rename_dict = {"SentimentText":"Message"}
data_df.rename(columns=rename_dict, inplace=True)

In [7]:
data_df.head(10)

Unnamed: 0,Sentiment,Message,Unnamed: 2
0,0,is so sad for my APL frie...,
1,0,I missed the New Moon trail...,
2,1,omg its already 7:30 :O,
3,0,.. Omgaga. Im sooo im gunna CRy. I'...,
4,0,i think mi bf is cheating on me!!! ...,
5,0,or i just worry too much?,
6,1,Juuuuuuuuuuuuuuuuussssst Chillin!!,
7,0,Sunny Again Work Tomorrow :-| ...,
8,1,handed in my uniform today . i miss you ...,
9,1,hmmmm.... i wonder how she my number @-),


In [8]:
data_df = data_df[["Sentiment","Message"]]
data_df.head()

Unnamed: 0,Sentiment,Message
0,0,is so sad for my APL frie...
1,0,I missed the New Moon trail...
2,1,omg its already 7:30 :O
3,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,0,i think mi bf is cheating on me!!! ...


In [9]:
def cleaning(Message):
    import nltk
    from nltk.tokenize import word_tokenize
    html_text = BeautifulSoup(Message,"html.parser").get_text()
    
    letters = re.sub("[^a-zA-Z]", " ", html_text)    
    letters = re.sub('[^a-zA-Zа-яА-Я1-9]+', ' ', html_text)
    letters = re.sub(' +',' ', html_text)
    letters = re.sub(r"\n", "", letters)    
    letters = re.sub(r"\r", "", letters) 
    letters = re.sub(r"[0-9]", "", letters)
    letters = re.sub(r"\'", "", letters)    
    letters = re.sub(r"\"", "", letters)
    
    letters = re.sub('\S*@\S*\s?', '', letters)
    letters = re.sub('\s+', ' ', letters)
    letters = re.sub(r"\'", "", letters)
    
    tokens = nltk.word_tokenize(letters)
    
    
    return " ".join(tokens)

In [10]:
data_df["cleaned_message"] = data_df["Message"].apply(cleaning)
data_df.sample(5)

Unnamed: 0,Sentiment,Message,cleaned_message
15263,0,*backache*,*backache*
10482,0,#hiccup cures help!!! *hiccup*,# hiccup cures help ! ! ! *hiccup*
29276,1,@aconfras yep when you have money for that... ...,yep when you have money for that ... but I thi...
17684,1,@_ashesandwine done! i'm sure @ryanstar will b...,done ! im sure will be happy today btw i cant ...
34632,1,"@aknednyt Hey you, yeah you. get on msn please.","Hey you , yeah you . get on msn please ."


In [11]:
data_df["sentimental_id"], sentiment_mappings = data_df["Sentiment"].factorize()
data_df.head()

Unnamed: 0,Sentiment,Message,cleaned_message,sentimental_id
0,0,is so sad for my APL frie...,is so sad for my APL friend ... ... ... ... .,0
1,0,I missed the New Moon trail...,I missed the New Moon trailer ...,0
2,1,omg its already 7:30 :O,omg its already : : O,1
3,0,.. Omgaga. Im sooo im gunna CRy. I'...,.. Omgaga . Im sooo im gunna CRy . Ive been at...,0
4,0,i think mi bf is cheating on me!!! ...,i think mi bf is cheating on me ! ! ! T_T,0


In [12]:
sentiment_tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', 
    encoding='latin-1', ngram_range=(1, 2), 
    stop_words='english')

In [13]:
sentiment_features = sentiment_tfidf.fit_transform(data_df["cleaned_message"])

In [14]:
sentiment_label = data_df["sentimental_id"]

In [15]:
sentiment_lsvc_model = LinearSVC()

In [16]:
X_train, X_test, y_train, y_test, indices_train,indices_test = train_test_split(sentiment_features, sentiment_label, data_df.index, 
    test_size=0.2, random_state=10)

In [17]:
sentiment_lsvc_model.fit(X_train,y_train)

LinearSVC()

In [18]:
ypred = sentiment_lsvc_model.predict(X_test)

In [19]:
ypred

array([0, 0, 0, ..., 0, 1, 0])

In [20]:
print((classification_report(y_test,ypred)))

              precision    recall  f1-score   support

           0       0.72      0.68      0.70      3780
           1       0.73      0.76      0.74      4199

    accuracy                           0.72      7979
   macro avg       0.72      0.72      0.72      7979
weighted avg       0.72      0.72      0.72      7979



In [21]:
accuracy_score(y_test,ypred)

0.7220203032961524

In [22]:
pd.DataFrame(confusion_matrix(y_test,ypred))

Unnamed: 0,0,1
0,2574,1206
1,1012,3187


In [28]:
Test_message = ["guess what you have passed!!"]

In [29]:
transformed_message = sentiment_tfidf.transform(Test_message)

In [30]:
predicted_sentiment = sentiment_lsvc_model.predict(transformed_message)

In [31]:
print(predicted_sentiment)

[0]


In [32]:
print(sentiment_mappings.take(predicted_sentiment)[0])

0
