<a href="https://colab.research.google.com/github/SIDEYS/Tweet-Sentiment-Analysis/blob/main/Tweets_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Feature Extraction from Tweets

In [None]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [None]:
# Download the NLTK stopword corpus used by stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Print NLTK's English stopword list; stemming() below drops these words from every tweet.
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

Data Preprocessing

In [None]:
# Load the tweet dataset. The path points at Colab's /content directory;
# adjust it when running the notebook elsewhere.
tw_df = pd.read_csv('/content/Tweets.csv')

In [None]:
# Dimensions of the loaded frame as (rows, columns).
tw_df.shape

(27481, 4)

In [None]:
# Preview the first rows to check that the columns were parsed as expected.
tw_df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [None]:
# Preview the last rows as well.
tw_df.tail()

Unnamed: 0,textID,text,selected_text,sentiment
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive
27480,6f7127d9d7,All this flirting going on - The ATG smiles...,All this flirting going on - The ATG smiles. Y...,neutral


In [None]:
# Count the missing values in each column of the dataset.
tw_df.isnull().sum()

textID           0
text             1
selected_text    1
sentiment        0
dtype: int64

In [None]:
# Replace the null values with an empty string
# (one value each is missing from `text` and `selected_text`).
tw_df = tw_df.fillna('')

In [None]:
# Re-check after fillna: every column should now report zero missing values.
tw_df.isnull().sum()

textID           0
text             0
selected_text    0
sentiment        0
dtype: int64

In [None]:
# Number of tweets per sentiment class.
tw_df['sentiment'].value_counts()

neutral     11118
positive     8582
negative     7781
Name: sentiment, dtype: int64

Stemming: reducing a word to its Root word

In [None]:
# One PorterStemmer instance, reused by stemming() for every word.
port_stem = PorterStemmer()

In [None]:
_NON_LETTERS = re.compile('[^a-zA-Z]')   # matches anything that is not an ASCII letter
_STOPWORD_CACHE = {}                     # language -> frozenset of stopwords, filled on first use


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading the corpus only once."""
    if 'english' not in _STOPWORD_CACHE:
        # stopwords.words() rebuilds the list on each call, so keep the result
        # around and store it as a set for constant-time membership checks.
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def stemming(text):
    """Clean a tweet and reduce its remaining words to their Porter stems.

    Steps, in order:
      1. replace every character that is not an ASCII letter with a space
         (drops punctuation, digits, etc.);
      2. convert to lowercase;
      3. split on whitespace;
      4. drop English stopwords, then stem each remaining word with `port_stem`;
      5. join the stems back together with single spaces.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        Space-separated stems; an empty string when no word survives the filtering.
    """
    words = _NON_LETTERS.sub(' ', text).lower().split()
    stop_words = _english_stopwords()
    return ' '.join(port_stem.stem(word) for word in words if word not in stop_words)

Applying Stemming

In [None]:
# Replace each tweet with its cleaned, stemmed form. This overwrites the raw text
# in place, so re-running the cell stems the already-stemmed text again.
tw_df['text'] = tw_df['text'].apply(stemming)

In [None]:
# Inspect the text column after cleaning and stemming.
print(tw_df['text'])

0                                               respond go
1                                  sooo sad miss san diego
2                                               boss bulli
3                                      interview leav alon
4                            son put releas alreadi bought
                               ...                        
27476    wish could come see u denver husband lost job ...
27477    wonder rake client made clear net forc dev lea...
27478    yay good enjoy break probabl need hectic weeke...
27479                                                worth
27480                           flirt go atg smile yay hug
Name: text, Length: 27481, dtype: object


**Label Encoding**



*   Positive - 1
*   Neutral - 0
*   Negative - -1





In [None]:
# Label the sentiments with numeric values: positive -> 1, neutral -> 0, negative -> -1
for label, code in (('positive', 1), ('neutral', 0), ('negative', -1)):
    tw_df.loc[tw_df['sentiment'] == label, 'sentiment'] = code

In [None]:
# Separate the model inputs (stemmed text) from the targets (sentiment codes)
X, Y = tw_df['text'].values, tw_df['sentiment'].values

In [None]:
# Feature array: the stemmed tweet strings.
print(X)

['respond go' 'sooo sad miss san diego' 'boss bulli' ...
 'yay good enjoy break probabl need hectic weekend take care hun xxxx'
 'worth' 'flirt go atg smile yay hug']


In [None]:
# Label array: the numeric sentiment codes.
print(Y)

[0 -1 -1 ... 1 1 0]


Splitting the data into training and test sets

In [None]:
# Hold out 20% of the samples as a test set (test_size=0.2);
# the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=3)

In [None]:
# Sizes of the full dataset, the training split and the test split, one per line
print(X.shape, X_train.shape, X_test.shape, sep='\n')

(27481,)
(21984,)
(5497,)


## **Feature Extraction**

In [None]:
# TF-IDF vectorizer with scikit-learn's default settings.
vectorizer = TfidfVectorizer()

In [None]:
# Fit the vocabulary and IDF weights on the training texts only, then encode
# the test texts with that same fitted vectorizer.
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)

In [None]:
# The labels were taken from a column that originally held strings, so cast
# both label arrays to integers before they are used for training and scoring.
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

In [None]:
# vocabulary_ maps each term to its column index and idf_ holds one weight per term.
# Report the vocabulary size and a small sample instead of dumping every entry.
print(len(vectorizer.vocabulary_))
print(list(vectorizer.vocabulary_.items())[:10])
print(vectorizer.idf_[:10])

{'best': 1371, 'vanilla': 15795, 'memori': 9258, 'sharffenberg': 13056, 'factori': 4880, 'gift': 5773, 'shop': 13166, 'sure': 14304, 'get': 5740, 'hey': 6627, 'back': 1021, 'robluket': 12443, 'live': 8552, 'http': 6904, 'ustr': 15735, 'lost': 8700, 'internet': 7303, 'raid': 11945, 'os': 10661, 'love': 8728, 'read': 12038, 'film': 5083, 'make': 8945, 'work': 16599, 'late': 8281, 'yet': 16913, 'fun': 5512, 'hear': 6508, 'colleagu': 2888, 'funni': 5528, 'song': 13629, 'list': 8535, 'ohhh': 10462, 'mannnn': 9006, 'laid': 8209, 'girl': 5800, 'economi': 4341, 'improv': 7149, 'soon': 13640, 'hiatu': 6648, 'like': 8490, 'realli': 12054, 'long': 8642, 'boo': 1656, 'cherri': 2556, 'italian': 7398, 'ice': 7026, 'fave': 4961, 'want': 16084, 'local': 8598, 'rita': 12410, 'twitter': 15397, 'send': 12944, 'daili': 3447, 'flavor': 5177, 'summer': 14230, 'forget': 5309, 'peopl': 11063, 'tomorrow': 15008, 'mother': 9691, 'day': 3536, 'think': 14769, 'that': 14713, 'go': 5881, 'didnt': 3827, 'time': 1488

In [None]:
# Vocabulary terms, ordered by their column index in the TF-IDF matrix.
vectorizer.get_feature_names_out()

array(['aa', 'aaa', 'aaaa', ..., 'zzzz', 'zzzzi', 'zzzzzzz'], dtype=object)

In [None]:
print(X_train_features.shape)
# Densify only a few rows for inspection: converting the whole sparse matrix
# would allocate rows x vocabulary floats just to print a truncated view.
print(X_train_features[:5].toarray())

(21984, 17133)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [None]:
# Printing the sparse matrix lists only its non-zero entries as (row, column) weight.
print(X_train_features)

  (0, 5740)	0.16539346775521319
  (0, 14304)	0.24816316187328824
  (0, 13166)	0.2758827584583102
  (0, 5773)	0.3259803813391597
  (0, 4880)	0.41872018692634927
  (0, 13056)	0.4595853050997152
  (0, 9258)	0.3526429665148487
  (0, 15795)	0.39775876804158883
  (0, 1371)	0.2440721071543755
  (1, 15735)	0.5245587498644139
  (1, 6904)	0.2586598816772902
  (1, 8552)	0.3546195810918415
  (1, 12443)	0.5777295123703583
  (1, 1021)	0.27791154862633305
  (1, 6627)	0.34809756956066445
  (2, 10661)	0.5767866762163607
  (2, 11945)	0.5882092892806138
  (2, 7303)	0.4168986522676629
  (2, 8700)	0.3840865473831187
  (3, 8945)	0.35252136410400814
  (3, 5083)	0.5527641712447573
  (3, 12038)	0.44925681310845067
  (3, 8728)	0.6069174364886749
  (4, 8535)	0.37372106510771097
  (4, 13629)	0.3117161340757922
  :	:
  (21980, 13840)	0.40909194306345337
  (21980, 3410)	0.40909194306345337
  (21980, 11128)	0.3815750295575855
  (21980, 12197)	0.3815750295575855
  (21980, 2850)	0.3029322137132459
  (21980, 640)	0.279

In [None]:
# Non-zero TF-IDF entries of the test split, shown as (row, column) weight.
print(X_test_features)

  (0, 9691)	0.4795029593151824
  (0, 9585)	0.6701272007953031
  (0, 6371)	0.44130441453386177
  (0, 3536)	0.3553264139846064
  (1, 11862)	0.5291203724255016
  (1, 11295)	0.3957041991754114
  (1, 9027)	0.5020918509620326
  (1, 8728)	0.2477081005602444
  (1, 5978)	0.2665429415156915
  (1, 5881)	0.21969156829035752
  (1, 418)	0.36150347647551967
  (2, 14769)	0.41073119671305935
  (2, 8848)	0.5581368672904686
  (2, 8490)	0.36912325721786005
  (2, 6904)	0.38118537244109246
  (2, 1475)	0.4880869330720329
  (3, 16856)	0.2544579656895602
  (3, 14889)	0.4204811835672199
  (3, 13981)	0.39329368323566477
  (3, 13331)	0.31661888370260494
  (3, 11877)	0.4752906634550235
  (3, 7068)	0.3887952603689301
  (3, 5083)	0.3556069598869249
  (4, 16856)	0.26159484808039907
  (4, 14533)	0.3954612113898112
  :	:
  (5493, 15505)	0.44775069335730705
  (5493, 14800)	0.2935514669166868
  (5493, 9054)	0.4452335410637227
  (5493, 8083)	0.24613000886299996
  (5493, 8015)	0.37256199494438264
  (5493, 7985)	0.342154276

In [None]:
# Integer labels of the training split.
print(Y_train)

[ 1  0 -1 ... -1  1  0]


In [None]:
# Integer labels of the test split.
print(Y_test)

[ 1  1  1 ... -1  1  0]


In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, f1_score

In [None]:
# from sklearn.linear_model import LogisticRegression
# model = LogisticRegression()

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# model = RandomForestClassifier()
# model.fit(X_train_features, Y_train)

In [None]:
# Support-vector classifier trained on the TF-IDF features. The cells below call
# model.predict, so a fitted `model` has to exist before they run.
model = SVC()
model.fit(X_train_features, Y_train)

In [None]:
# prediction on training data
# NOTE: `model` must already have been fitted by an earlier cell before this cell runs.

prediction_on_training_data = model.predict(X_train_features)
# accuracy_score takes the true labels first, then the predictions
accuracy_on_training_data = accuracy_score(Y_train, prediction_on_training_data)

In [None]:
# Accuracy on the data the model was fitted on; compare it with the test accuracy to gauge overfitting.
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  0.9369996360989811


In [None]:
# accuracy score on the test data
X_test_prediction = model.predict(X_test_features)
# Pass the true labels first and the predictions second, as accuracy_score expects
# (and as the training-accuracy cell already does).
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)


In [None]:
# Accuracy on the held-out test split, i.e. on tweets the model was not fitted on.
print('Accuracy score of the test data : ', test_data_accuracy)


Accuracy score of the test data :  0.6971075131890122


In [None]:
# F1 score on the test data. The target has three classes, so f1_score needs an
# explicit `average`; macro-averaging weights the three sentiment classes equally.
# The score is computed before it is printed, with the true labels passed first.
test_data_f1_score = f1_score(Y_test, X_test_prediction, average='macro')
print('F1 score of the test data : ', test_data_f1_score)