# Importing libraries

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import nltk

# Importing data

In [2]:
df = pd.read_csv("train.csv")

In [3]:
df

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


In [4]:
df.columns

Index(['textID', 'text', 'selected_text', 'sentiment'], dtype='object')

# Exploring the data

In [5]:
# statistical description
df.describe()

Unnamed: 0,textID,text,selected_text,sentiment
count,27481,27480,27480,27481
unique,27481,27480,22463,3
top,cb774db0d1,"I`d have responded, if I were going",good,neutral
freq,1,1,199,11118


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   textID         27481 non-null  object
 1   text           27480 non-null  object
 2   selected_text  27480 non-null  object
 3   sentiment      27481 non-null  object
dtypes: object(4)
memory usage: 858.9+ KB


# checking for null values

In [7]:
df.isna().sum()

textID           0
text             1
selected_text    1
sentiment        0
dtype: int64

# Dropping the null value

In [8]:
df.dropna()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


# checking for unique values

In [9]:
df.nunique()

textID           27481
text             27480
selected_text    22463
sentiment            3
dtype: int64

In [201]:
df1 = df.loc[:, ["text", "sentiment"]]
df1

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative
...,...,...
27476,wish we could come see u on Denver husband l...,negative
27477,I`ve wondered about rake to. The client has ...,negative
27478,Yay good for both of you. Enjoy the break - y...,positive
27479,But it was worth it ****.,positive


# Data Preprocessing

### Removing the qotations and other that not belong to alphabets using regex

In [206]:
import re

df1["clean_text"] = df1["text"].apply(lambda x: re.sub("[^a-zA-Z]", " ", str(x)))
df1

Unnamed: 0,text,sentiment,clean_text
0,"I`d have responded, if I were going",neutral,I d have responded if I were going
1,Sooo SAD I will miss you here in San Diego!!!,negative,Sooo SAD I will miss you here in San Diego
2,my boss is bullying me...,negative,my boss is bullying me
3,what interview! leave me alone,negative,what interview leave me alone
4,"Sons of ****, why couldn`t they put them on t...",negative,Sons of why couldn t they put them on t...
...,...,...,...
27476,wish we could come see u on Denver husband l...,negative,wish we could come see u on Denver husband l...
27477,I`ve wondered about rake to. The client has ...,negative,I ve wondered about rake to The client has ...
27478,Yay good for both of you. Enjoy the break - y...,positive,Yay good for both of you Enjoy the break y...
27479,But it was worth it ****.,positive,But it was worth it


### Converting the text to lower case

In [207]:
df1["clean_text"] = df1["clean_text"].apply(lambda x: x.lower())
df1

Unnamed: 0,text,sentiment,clean_text
0,"I`d have responded, if I were going",neutral,i d have responded if i were going
1,Sooo SAD I will miss you here in San Diego!!!,negative,sooo sad i will miss you here in san diego
2,my boss is bullying me...,negative,my boss is bullying me
3,what interview! leave me alone,negative,what interview leave me alone
4,"Sons of ****, why couldn`t they put them on t...",negative,sons of why couldn t they put them on t...
...,...,...,...
27476,wish we could come see u on Denver husband l...,negative,wish we could come see u on denver husband l...
27477,I`ve wondered about rake to. The client has ...,negative,i ve wondered about rake to the client has ...
27478,Yay good for both of you. Enjoy the break - y...,positive,yay good for both of you enjoy the break y...
27479,But it was worth it ****.,positive,but it was worth it


### Removing the stopwords

In [13]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [14]:
eng_stopwords = stopwords.words("english")
eng_stopwords[:5]

['i', 'me', 'my', 'myself', 'we']

In [15]:
w = []
for i in df1["clean_text"]:
    for word in word_tokenize(str(i)):
        if word not in eng_stopwords:
            w.append(word)

w

['responded',
 'going',
 'sooo',
 'sad',
 'miss',
 'san',
 'diego',
 'boss',
 'bullying',
 'interview',
 'leave',
 'alone',
 'sons',
 'put',
 'releases',
 'already',
 'bought',
 'http',
 'www',
 'dothebouncy',
 'com',
 'smf',
 'shameless',
 'plugging',
 'best',
 'rangers',
 'forum',
 'earth',
 'feedings',
 'baby',
 'fun',
 'smiles',
 'coos',
 'soooo',
 'high',
 'journey',
 'wow',
 'u',
 'became',
 'cooler',
 'hehe',
 'possible',
 'much',
 'love',
 'hopeful',
 'reckon',
 'chances',
 'minimal',
 'p',
 'never',
 'gon',
 'na',
 'get',
 'cake',
 'stuff',
 'really',
 'really',
 'like',
 'song',
 'love',
 'story',
 'taylor',
 'swift',
 'sharpie',
 'running',
 'dangerously',
 'low',
 'ink',
 'want',
 'go',
 'music',
 'tonight',
 'lost',
 'voice',
 'test',
 'test',
 'lg',
 'env',
 'uh',
 'oh',
 'sunburned',
 'ok',
 'trying',
 'plot',
 'alternatives',
 'speak',
 'sigh',
 'sick',
 'past',
 'days',
 'thus',
 'hair',
 'looks',
 'wierd',
 'didnt',
 'hat',
 'would',
 'look',
 'http',
 'tinyurl',
 'co

In [16]:
len(w)

197759

In [210]:
# final code for removing the stop words (include this code alone in this part)
df1["clean_text"] = df1["clean_text"].apply(
    lambda x: " ".join(
        [word for word in word_tokenize(str(x)) if word not in eng_stopwords]
    )
)

In [211]:
df1

Unnamed: 0,text,sentiment,clean_text
0,"I`d have responded, if I were going",neutral,responded going
1,Sooo SAD I will miss you here in San Diego!!!,negative,sooo sad miss san diego
2,my boss is bullying me...,negative,boss bullying
3,what interview! leave me alone,negative,interview leave alone
4,"Sons of ****, why couldn`t they put them on t...",negative,sons put releases already bought
...,...,...,...
27476,wish we could come see u on Denver husband l...,negative,wish could come see u denver husband lost job ...
27477,I`ve wondered about rake to. The client has ...,negative,wondered rake client made clear net force devs...
27478,Yay good for both of you. Enjoy the break - y...,positive,yay good enjoy break probably need hectic week...
27479,But it was worth it ****.,positive,worth


### Defining a function for a combined preprocessing

In [226]:
def preprocess(text):
    # importing libraries
    import re
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize

    # creating a variable consist of english stopwords
    eng_stopwords = stopwords.words("english")

    # removing punctuations
    text = re.sub("[^a-zA-Z]", " ", str(text))

    # converting to lower case
    text = text.lower()

    # removing the stopwords
    return " ".join([word for word in word_tokenize(text) if word not in eng_stopwords])

In [227]:
preprocess("HellOOO ?? hai are am i luv uuu")

'hellooo hai luv uuu'

In [219]:
test = df.loc[:, ["text", "sentiment"]]
test

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative
...,...,...
27476,wish we could come see u on Denver husband l...,negative
27477,I`ve wondered about rake to. The client has ...,negative
27478,Yay good for both of you. Enjoy the break - y...,positive
27479,But it was worth it ****.,positive


In [228]:
test["ct"] = test["text"].apply(preprocess)

In [229]:
test

Unnamed: 0,text,sentiment,ct
0,"I`d have responded, if I were going",neutral,responded going
1,Sooo SAD I will miss you here in San Diego!!!,negative,sooo sad miss san diego
2,my boss is bullying me...,negative,boss bullying
3,what interview! leave me alone,negative,interview leave alone
4,"Sons of ****, why couldn`t they put them on t...",negative,sons put releases already bought
...,...,...,...
27476,wish we could come see u on Denver husband l...,negative,wish could come see u denver husband lost job ...
27477,I`ve wondered about rake to. The client has ...,negative,wondered rake client made clear net force devs...
27478,Yay good for both of you. Enjoy the break - y...,positive,yay good enjoy break probably need hectic week...
27479,But it was worth it ****.,positive,worth


----

# Sentiment Analysis

## 1) lexicon - based classification


In [295]:
from nltk.sentiment import SentimentIntensityAnalyzer


sia = SentimentIntensityAnalyzer()

In [22]:
df2 = df1.loc[:, ["clean_text", "sentiment"]]
df2.head()

Unnamed: 0,clean_text,sentiment
0,responded going,neutral
1,sooo sad miss san diego,negative
2,boss bullying,negative
3,interview leave alone,negative
4,sons put releases already bought,negative


In [296]:
df2["sentiment_score"] = df2["clean_text"].apply(
    lambda x: sia.polarity_scores(x)["compound"]
)

In [24]:
df2

Unnamed: 0,clean_text,sentiment,sentiment_score
0,responded going,neutral,0.0000
1,sooo sad miss san diego,negative,-0.5719
2,boss bullying,negative,-0.5994
3,interview leave alone,negative,-0.2960
4,sons put releases already bought,negative,0.0000
...,...,...,...
27476,wish could come see u denver husband lost job ...,negative,0.1027
27477,wondered rake client made clear net force devs...,negative,0.3818
27478,yay good enjoy break probably need hectic week...,positive,0.9136
27479,worth,positive,0.2263


### Classifying based on sentiment score


#### Getting the range of each true sentiments

In [318]:
px.histogram(
    df2[df2["sentiment"] == "positive"],
    x="sentiment_score",
    opacity=0.7,
    title="Distribution of true positive",
)

In [320]:
px.histogram(
    df2[df2["sentiment"] == "neutral"],
    x="sentiment_score",
    opacity=0.7,
    title="Distribution of true neutral",
)

In [321]:
px.histogram(
    df2[df2["sentiment"] == "negative"],
    x="sentiment_score",
    opacity=0.7,
    title="Distribution of true negative",
)

### Classifying the sentiments based on the range observed from the histograms

By using the approximate range of each sentiment from the above histograms, we can predict the sentiments from the sentiment score.

In [328]:
df2["pred_sentiment"] = df2["sentiment_score"].apply(
    lambda x: "positive" if x > 0.4 else "negative" if x < -0.1 else "neutral"
)

In [326]:
df2

Unnamed: 0,clean_text,sentiment,sentiment_score,pred_sentiment
0,responded going,neutral,0.0000,negative
1,sooo sad miss san diego,negative,-0.5719,negative
2,boss bullying,negative,-0.5994,negative
3,interview leave alone,negative,-0.2960,negative
4,sons put releases already bought,negative,0.0000,negative
...,...,...,...,...
27476,wish could come see u denver husband lost job ...,negative,0.1027,neutral
27477,wondered rake client made clear net force devs...,negative,0.3818,neutral
27478,yay good enjoy break probably need hectic week...,positive,0.9136,positive
27479,worth,positive,0.2263,neutral


We got a new column `pred_sentiment` consist of predicted sentiments.

### Getting the accuracy of our prediction

In [349]:
from sklearn.metrics import accuracy_score


lex_accuracy = accuracy_score(df2["sentiment"], df2["pred_sentiment"])
print(f"====== The accuracy score is {round(lex_accuracy,2)} ======")



### Comparing the true values with the predicted values.

In [331]:
px.box(df2, x="sentiment", y="sentiment_score", title="Box plot for true sentiments")

In [332]:
px.box(
    df2,
    x="pred_sentiment",
    y="sentiment_score",
    title="Box plot for predicted sentiments",
)

### Statistical summary of predicted and true values

In [36]:
df2.groupby(["sentiment"])["sentiment_score"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
negative,7781.0,-0.144958,0.40472,-0.9726,-0.4767,-0.1531,0.0516,0.9277
neutral,11118.0,0.144981,0.316339,-0.9313,0.0,0.0,0.3818,0.959
positive,8582.0,0.538789,0.303704,-0.891,0.4215,0.5994,0.7716,0.9826


In [37]:
df2.groupby(["pred_sentiment"])["sentiment_score"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
pred_sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
negative,6163.0,-0.40212,0.215878,-0.9726,-0.5423,-0.4215,-0.2263,-0.0018
neutral,7991.0,0.008054,0.025737,0.0,0.0,0.0,0.0,0.1496
positive,13327.0,0.564401,0.199801,0.1513,0.4215,0.5719,0.7269,0.9826


### Confusion matrix

In [341]:
from sklearn.metrics import confusion_matrix

lex_cm = confusion_matrix(
    df2["sentiment"], df2["pred_sentiment"], labels=["negative", "neutral", "positive"]
)
lex_cm

array([[4162, 2621,  998],
       [1148, 7212, 2758],
       [ 285, 1489, 6808]], dtype=int64)

In [342]:
px.imshow(
    lex_cm,
    text_auto=True,
    x=["negative", "neutral", "positive"],
    y=["negative", "neutral", "positive"],
    title="Confusion matrix for lexicon based classification",
)

This confusion matrix gives the number of true predictions and false predictions.

---

# Using machine learning models

In [41]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [343]:
df

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


### Feature selection

In [43]:
df3 = df1.loc[:, ["clean_text", "sentiment"]]
df3

Unnamed: 0,clean_text,sentiment
0,responded going,neutral
1,sooo sad miss san diego,negative
2,boss bullying,negative
3,interview leave alone,negative
4,sons put releases already bought,negative
...,...,...
27476,wish could come see u denver husband lost job ...,negative
27477,wondered rake client made clear net force devs...,negative
27478,yay good enjoy break probably need hectic week...,positive
27479,worth,positive


### splitting test and train dataset

In [71]:
x_train, x_test, y_train, y_test = train_test_split(
    df3["clean_text"], df3["sentiment"], test_size=0.2, random_state=24
)

In [72]:
print(len(x_train))
print(len(x_test))

21984
5497


### Vectorization

- Since the data is in string format, computers couldn't able to understand them. So we use some techniques to make the data understandable by computers. One such technique is `vectorization`.

- Vectorization is the process of converting text data into numerical vectors. This allows computers to process and understand text data in a way that is similar to how they process numerical data.

- In this project, TF-IDF (Term Frequency-Inverse Document Frequency) vectorization is used.

- TF-IDF is a more sophisticated vectorization technique that assigns a weight to each word in a document, based on the frequency of the word in the document and the frequency of the word in the entire corpus.

In [345]:
vector = TfidfVectorizer()
x_train_vec = vector.fit_transform(x_train)
x_test_vec = vector.transform(x_test)

In [74]:
x_train_vec.toarray().shape

(21984, 21422)

In [75]:
x_test_vec.toarray().shape

(5497, 21422)

The train and the test input data are vectorised.

----

## Model training

- In this project, we use <b>Support vector machines (SVMs)</b> to identify sentiment.

- <b>Support vector machines (SVMs)</b> are a type of machine learning algorithm that can be used for classification and regression tasks. SVMs work by finding the hyperplane that best separates the different classes of data.

- In the case of <b>linear SVMs</b>, the hyperplane is a straight line.

In [358]:
from sklearn.svm import SVC

svm = SVC(kernel="linear", probability=True)

# Fitting our data to the model.
svm.fit(x_train_vec, y_train)

In [365]:
# Predicting the sentiments using the trained model
y_pred = svm.predict(x_test_vec)
y_pred[:10]

array(['neutral', 'neutral', 'negative', 'positive', 'positive',
       'positive', 'neutral', 'positive', 'neutral', 'neutral'],
      dtype=object)

In [366]:
y_test.head(10)

5102      neutral
1084      neutral
5794     negative
20581    positive
1376     positive
20173    positive
9011      neutral
19922    positive
24839     neutral
18058     neutral
Name: sentiment, dtype: object

### Getting the accuracy score

In [367]:
svc_accuracy = accuracy_score(y_test, y_pred)
print(f"====== The accuracy score is {round(svc_accuracy,2)} ======")



### Confusion matrix

In [368]:
svc_cm = confusion_matrix(y_test, y_pred, labels=["negative", "neutral", "positive"])
svc_cm

array([[ 919,  511,   86],
       [ 278, 1738,  262],
       [  61,  407, 1235]], dtype=int64)

In [369]:
px.imshow(
    svc_cm,
    x=["negative", "neutral", "positive"],
    y=["negative", "neutral", "positive"],
    text_auto=True,
)

This confusion matrix gives the number of true predictions and false predictions.

AUC ROC

In [372]:
predicted_probabilities = svm.predict_proba(x_test_vec)

In [375]:
predicted_probabilities

array([[0.19931035, 0.74883569, 0.05185396],
       [0.35752168, 0.59923138, 0.04324694],
       [0.61631196, 0.33781344, 0.0458746 ],
       ...,
       [0.05788588, 0.81500461, 0.12710951],
       [0.42192381, 0.44131815, 0.13675804],
       [0.26229232, 0.59243262, 0.14527506]])

In [376]:
y_test

5102      neutral
1084      neutral
5794     negative
20581    positive
1376     positive
           ...   
6328     negative
5656     positive
8979      neutral
26499    negative
27431     neutral
Name: sentiment, Length: 5497, dtype: object

In [373]:
from sklearn.metrics import roc_auc_score

auc_roc = roc_auc_score(y_test, predicted_probabilities, multi_class="ovr")

In [374]:
auc_roc

0.8575330121069906

---

In [371]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    df["clean_text"], df["sentiment"], test_size=0.2, random_state=42
)

# Extract features using TF-IDF vectorization
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train an SVM classifier
svm = SVC(kernel="linear")
svm.fit(X_train_vec, y_train)
fffff
# Predict sentiment on the test set
y_pred = svm.predict(X_test_vec)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

KeyError: 'clean_text'

In [377]:
testing = pd.read_csv("test.csv")

In [378]:
testing

Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative
3,01082688c6,happy bday!,positive
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive
...,...,...,...
3529,e5f0e6ef4b,"its at 3 am, im very tired but i can`t sleep ...",negative
3530,416863ce47,All alone in this old house again. Thanks for...,positive
3531,6332da480c,I know what you mean. My little dog is sinkin...,negative
3532,df1baec676,_sutra what is your next youtube video gonna b...,positive


In [379]:
testing["clean_text"] = testing["text"].apply(preprocess)

In [380]:
testing

Unnamed: 0,textID,text,sentiment,clean_text
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,last session day http twitpic com ezh
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,shanghai also really exciting precisely skyscr...
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,recession hit veronique branquinho quit compan...
3,01082688c6,happy bday!,positive,happy bday
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,http twitpic com w p like
...,...,...,...,...
3529,e5f0e6ef4b,"its at 3 am, im very tired but i can`t sleep ...",negative,im tired sleep try
3530,416863ce47,All alone in this old house again. Thanks for...,positive,alone old house thanks net keeps alive kicking...
3531,6332da480c,I know what you mean. My little dog is sinkin...,negative,know mean little dog sinking depression wants ...
3532,df1baec676,_sutra what is your next youtube video gonna b...,positive,sutra next youtube video gon na love videos


In [381]:
testing["sentiment_score"] = testing["clean_text"].apply(
    lambda x: sia.polarity_scores(x)["compound"]
)

In [383]:
testing["pred_sentiment"] = testing["sentiment_score"].apply(
    lambda x: "positive" if x > 0.4 else "negative" if x < -0.1 else "neutral"
)

In [384]:
testing

Unnamed: 0,textID,text,sentiment,clean_text,sentiment_score,pred_sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,last session day http twitpic com ezh,0.0000,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,shanghai also really exciting precisely skyscr...,0.7501,positive
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,recession hit veronique branquinho quit compan...,-0.7096,negative
3,01082688c6,happy bday!,positive,happy bday,0.5719,positive
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,http twitpic com w p like,0.3612,neutral
...,...,...,...,...,...,...
3529,e5f0e6ef4b,"its at 3 am, im very tired but i can`t sleep ...",negative,im tired sleep try,-0.4404,negative
3530,416863ce47,All alone in this old house again. Thanks for...,positive,alone old house thanks net keeps alive kicking...,0.7430,positive
3531,6332da480c,I know what you mean. My little dog is sinkin...,negative,know mean little dog sinking depression wants ...,-0.5325,negative
3532,df1baec676,_sutra what is your next youtube video gonna b...,positive,sutra next youtube video gon na love videos,0.6369,positive


In [385]:
accuracy_score(testing["sentiment"], testing["pred_sentiment"])

0.6740237691001698