# Text Sentiment Analysis

In [1]:
!pip install scikit-learn   



In [2]:
!pip install -U spacy



In [3]:
!python -m spacy download en

Collecting en-core-web-sm==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0-py3-none-any.whl (13.6 MB)
[!] As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the full
pipeline package name 'en_core_web_sm' instead.
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [4]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0-py3-none-any.whl (13.6 MB)
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [1]:
import spacy    # spacy is used for natural language processing
from spacy import displacy

In [2]:
nlp = spacy.load('en_core_web_sm')    # it will return a language object needed to process text

In [3]:
from spacy.lang.en.stop_words import STOP_WORDS  

In [4]:
stopwords = list(STOP_WORDS)   # stopwords => commonly used words
print(stopwords)

['itself', '’ll', 'further', 'twelve', 'who', 'before', 'very', 'whatever', 'yet', 'your', 'much', 'quite', 'do', 'whereby', 'say', 'neither', 'upon', 'mostly', 'when', 'you', 'five', 'between', 'the', 'hereafter', 'several', 'anyone', 'must', "'m", 'regarding', 'beside', 'same', 'either', 'yourselves', 'doing', 'during', "'ll", 'that', 'we', 'most', 'whoever', 'alone', 'indeed', 'rather', 'nine', 'becoming', 'whole', '‘ve', 'wherever', 'they', 'twenty', 'without', 'n’t', 'which', 'thereby', 'due', 'always', '’d', 'and', 'somehow', 'but', 'up', 'hence', 'another', 'elsewhere', 'she', 'only', 'various', 'whither', 'him', 'seemed', 'was', '‘s', 'over', 'from', 'can', 'one', 'almost', 'done', 'enough', 'after', 'three', 'an', 'other', 'above', 'move', 'besides', 'by', 'nevertheless', 'least', 'ca', 'yours', 'throughout', 'been', 'any', 'serious', 'part', 'me', 'put', '‘ll', 'while', 'because', 'what', 'down', 'bottom', "'s", 'such', 'front', 'in', 'unless', 'per', 'does', 'should', 'canno

In [5]:
import pandas as pd

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

###### We will be performing sentiment analysis on twitter dataset 

In [7]:
twitter_data = pd.read_csv('twitter_data.csv')  #reading the contents of the twitter dataset
twitter_data.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [8]:
columns_name = ['Id', 'Sentiment', 'Tweet']
twitter_data.columns = columns_name

In [9]:
twitter_data.head()

Unnamed: 0,Id,Sentiment,Tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [10]:
twitter_data.shape

(31962, 3)

In [11]:
twitter_data['Sentiment'].value_counts()

0    29720
1     2242
Name: Sentiment, dtype: int64

In [12]:
twitter_data.isnull().sum()

Id           0
Sentiment    0
Tweet        0
dtype: int64

In [13]:
t_data = pd.DataFrame(data = twitter_data)
t_data.loc[0].Tweet

' @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run'

### Tokenization

In [14]:
import string

In [15]:
punct = string.punctuation
punct

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [16]:
def data_cleaning(sentence):
    doc = nlp(sentence)
    
    tokens = []
    for token in doc:
        if token.lemma_ != "-PRON-":
            temp = token.lemma_.lower().strip()
        else:
            temp = token.lower_
        tokens.append(temp)
        
    cleaned_tokens = []
    for token in tokens:
        if token not in stopwords and token not in punct:
            cleaned_tokens.append(token)
    return cleaned_tokens

In [17]:
data_cleaning(t_data.loc[0].Tweet)

['@user',
 'father',
 'dysfunctional',
 'selfish',
 'drag',
 'kid',
 'dysfunction',
 'run']

### Part of Speech (POS) Tagging

In [18]:
doc = nlp(t_data.loc[0].Tweet)
doc

 @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run

In [19]:
for token in doc:
    print(token.text, token.lemma_)

   
@user @user
when when
a a
father father
is be
dysfunctional dysfunctional
and and
is be
so so
selfish selfish
he he
drags drag
his his
kids kid
into into
his his
dysfunction dysfunction
. .
     
# #
run run


In [20]:
for token in doc:
    print(f'{token.text:{15}} {token.lemma_:{15}}{token.pos_:{15}}{token.is_stop}')

                               SPACE          False
@user           @user          ADV            False
when            when           ADV            True
a               a              DET            True
father          father         NOUN           False
is              be             VERB           True
dysfunctional   dysfunctional  ADJ            False
and             and            CCONJ          True
is              be             VERB           True
so              so             ADV            True
selfish         selfish        ADJ            False
he              he             PRON           True
drags           drag           VERB           False
his             his            PRON           True
kids            kid            NOUN           False
into            into           ADP            True
his             his            PRON           True
dysfunction     dysfunction    NOUN           False
.               .              PUNCT          False
                      

### Vectorization Feature Engineering (TF-IDF)

##### Logistic Regression 

In [21]:
from sklearn.linear_model import LogisticRegression

In [22]:
tfidf = TfidfVectorizer(tokenizer = data_cleaning)
classifier = LogisticRegression()

In [23]:
data = twitter_data['Tweet']
target = twitter_data['Sentiment']

In [24]:
data_train, data_test, target_train, target_test = train_test_split(data, target, test_size = 0.2, random_state = 42, stratify = twitter_data['Sentiment'])

In [25]:
data_train.shape, data_test.shape

((25569,), (6393,))

In [26]:
clf = Pipeline([('tfidf', tfidf), ('clf', classifier)])

In [27]:
clf.fit(data_train, target_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(tokenizer=<function data_cleaning at 0x000002BC24E1B0D0>)),
                ('clf', LogisticRegression())])

##### Support Vector Machine (SVM)

In [28]:
from sklearn.svm import SVC

In [29]:
clf2 = Pipeline([('tfidf',TfidfVectorizer()),('clf',SVC(C = 1, gamma = 'auto'))])

In [30]:
clf2.fit(data_train, target_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', SVC(C=1, gamma='auto'))])

##### Random Forest Classifier

In [31]:
from sklearn.ensemble import RandomForestClassifier
clf3 = Pipeline([('tfidf',TfidfVectorizer()),('clf',RandomForestClassifier(n_estimators=100,n_jobs=-1))])

In [32]:
clf3.fit(data_train, target_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('clf', RandomForestClassifier(n_jobs=-1))])