# Import Dependencies

In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report

# Load Dataset

In [2]:
# Load the tweet dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("Tweets_Dataset.csv")

In [3]:
df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [4]:
df.tail()

Unnamed: 0,textID,text,selected_text,sentiment
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive
27480,6f7127d9d7,All this flirting going on - The ATG smiles...,All this flirting going on - The ATG smiles. Y...,neutral


### Checking the size of the data (rows and columns)

In [5]:
df.shape

(27481, 4)

### Counting the missing values in the dataset

In [6]:
df.isnull().sum()

textID           0
text             1
selected_text    1
sentiment        0
dtype: int64

We can see that there is 1 null value in the "text" column and 1 null value in the "selected_text" column.

### Removing the null values 

In [7]:
# Drop every row that has a missing value in any column.
df = df.dropna()

In [8]:
df.shape

(27480, 4)

In [9]:
df.isnull().sum()

textID           0
text             0
selected_text    0
sentiment        0
dtype: int64

Now there are no null values left.

### Checking the value count of "sentiment" column

In [10]:
df["sentiment"].value_counts()

sentiment
neutral     11117
positive     8582
negative     7781
Name: count, dtype: int64

# Let's Perform Text Cleaning

### Cleaning and stemming the text

In [11]:
# Porter stemmer instance reused by clean_text for every token.
port_stem = PorterStemmer()

In [12]:
# Filled on first use by _english_stop_words(); kept at module level so the
# stop-word list is built once instead of once per token.
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, built on first call."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words("english"))
    return _STOP_WORDS_CACHE


def clean_text(text):
    """Normalize a raw tweet into a space-separated string of stemmed tokens.

    Steps, in order: lower-case the text, replace URLs with a space, collapse
    every run of asterisks (censored words) into the single token "****",
    replace every character that is not a letter, whitespace or "*" with a
    space, drop English stop words, and Porter-stem the remaining tokens.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; an empty string when no
        token survives the filtering.
    """
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)  # strip URLs
    text = re.sub(r'\*+', '****', text)  # normalize censored words to "****"
    text = re.sub(r'[^a-zA-Z\s*]', ' ', text)  # keep only letters, whitespace and "*"
    stop_words = _english_stop_words()
    # Stop words are filtered on the un-stemmed token, then the survivors are stemmed.
    tokens = [port_stem.stem(word) for word in text.split() if word not in stop_words]
    return " ".join(tokens)

In [13]:
# Run clean_text on each tweet and store the result in a new column; the "text" column is left untouched.
df["cleaned_text"] = df["text"].apply(clean_text) 

In [14]:
df.head()

Unnamed: 0,textID,text,selected_text,sentiment,cleaned_text
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,respond go
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,sooo sad miss san diego
2,088c60f138,my boss is bullying me...,bullying me,negative,boss bulli
3,9642c003ef,what interview! leave me alone,leave me alone,negative,interview leav alon
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,son **** put releas alreadi bought


In [15]:
df.tail()

Unnamed: 0,textID,text,selected_text,sentiment,cleaned_text
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,wish could come see u denver husband lost job ...
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,wonder rake client made clear net forc dev lea...
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,yay good enjoy break probabl need hectic weeke...
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,worth ****
27480,6f7127d9d7,All this flirting going on - The ATG smiles...,All this flirting going on - The ATG smiles. Y...,neutral,flirt go atg smile yay hug


# Encoding

In [16]:
# Numeric class ids for the three sentiment labels.
label_mapping = {
    "negative": -1,
    "neutral": 0,
    "positive": 1,
}
# Direct dict indexing (not .get) so that an unexpected label raises KeyError.
df['target'] = df['sentiment'].map(label_mapping.__getitem__)

In [17]:
df

Unnamed: 0,textID,text,selected_text,sentiment,cleaned_text,target
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,respond go,0
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,sooo sad miss san diego,-1
2,088c60f138,my boss is bullying me...,bullying me,negative,boss bulli,-1
3,9642c003ef,what interview! leave me alone,leave me alone,negative,interview leav alon,-1
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,son **** put releas alreadi bought,-1
...,...,...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative,wish could come see u denver husband lost job ...,-1
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative,wonder rake client made clear net forc dev lea...,-1
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive,yay good enjoy break probabl need hectic weeke...,1
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive,worth ****,1


In [18]:
# Features (cleaned tweet text) and labels (encoded sentiment) as arrays
X, Y = df['cleaned_text'].values, df['target'].values

In [19]:
print(X)

['respond go' 'sooo sad miss san diego' 'boss bulli' ...
 'yay good enjoy break probabl need hectic weekend take care hun xxxx'
 'worth ****' 'flirt go atg smile yay hug']


In [20]:
print(Y)

[ 0 -1 -1 ...  1  1  0]


In [21]:
# Hold out 20% of the tweets for testing. Stratifying on Y keeps the class
# ratio the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=42,
)

In [22]:
print(X.shape, X_train.shape, X_test.shape)

(27480,) (21984,) (5496,)


In [23]:
print(Y.shape, Y_train.shape, Y_test.shape)

(27480,) (21984,) (5496,)


In [24]:
# Converting the textual data into numerical data
vectorizer = TfidfVectorizer()
# Fit on the training tweets only, so nothing from the test split influences
# what the vectorizer learns.
X_train = vectorizer.fit_transform(X_train)
# The test tweets are only transformed with the already-fitted vectorizer.
X_test = vectorizer.transform(X_test)
# NOTE(review): X_train / X_test are overwritten with the TF-IDF matrices, so
# re-running this cell without first re-running the split cell would hand
# matrices (not text) to the vectorizer.

In [25]:
# Fit a linear-kernel support vector classifier on the TF-IDF training matrix
model = SVC(
    kernel="linear",
    probability=True,
    random_state=42,
)
model.fit(X_train, Y_train)

In [26]:
print(X_train)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 146777 stored elements and shape (21984, 16374)>
  Coords	Values
  (0, 1721)	0.43649767868757317
  (0, 3271)	0.5292298759415468
  (0, 7860)	0.501921012985244
  (0, 11134)	0.4508391371132796
  (0, 11901)	0.2724057343046852
  (1, 251)	0.3321892497864948
  (1, 3468)	0.1711940440707862
  (1, 4868)	0.22228591470688278
  (1, 5734)	0.3002311147367343
  (1, 6017)	0.28823108725455726
  (1, 8166)	0.19639714832231647
  (1, 8379)	0.4150466266404689
  (1, 9841)	0.4048274498097983
  (1, 11721)	0.3262364473624006
  (1, 12876)	0.24816071729148392
  (1, 14378)	0.20530796854453826
  (1, 15597)	0.2298474763652387
  (2, 2870)	0.3727582740544342
  (2, 3468)	0.14911235653035718
  (2, 9211)	0.3389588409066158
  (2, 9296)	0.3727582740544342
  (2, 9553)	0.4417513380569676
  (2, 9747)	0.1973142324558008
  (2, 14067)	0.39465624519227027
  (2, 16294)	0.4417513380569676
  :	:
  (21979, 4026)	0.4247398927361659
  (21979, 9039)	0.38197694758775047
  (2197

# Model Evaluation 

## Accuracy Score

In [27]:
# Accuracy on the same data the model was fitted on
Y_pred = model.predict(X_train)
training_data_accuracy = accuracy_score(y_true=Y_train, y_pred=Y_pred)

In [28]:
print(f"Accuracy score on the training data: {round(training_data_accuracy,4)*100:.2f}%" )

Accuracy score on the training data: 82.32%


In [29]:
# Accuracy on the held-out test split
Y_test_pred = model.predict(X_test)
test_data_accuracy = accuracy_score(y_true=Y_test, y_pred=Y_test_pred)

In [30]:
print(f"Accuracy score on the test data: {round(test_data_accuracy,4)*100:.2f}%" )

Accuracy score on the test data: 71.45%


Model accuracy on the test data is 71.45%, compared with 82.32% on the training data.

In [35]:
print(classification_report(Y_train,Y_pred))

              precision    recall  f1-score   support

          -1       0.86      0.75      0.80      6225
           0       0.77      0.87      0.82      8894
           1       0.87      0.82      0.85      6865

    accuracy                           0.82     21984
   macro avg       0.83      0.82      0.82     21984
weighted avg       0.83      0.82      0.82     21984



In [34]:
print(classification_report(Y_test,Y_test_pred))

              precision    recall  f1-score   support

          -1       0.74      0.64      0.69      1556
           0       0.66      0.76      0.70      2223
           1       0.79      0.72      0.75      1717

    accuracy                           0.71      5496
   macro avg       0.73      0.71      0.72      5496
weighted avg       0.72      0.71      0.71      5496



# Making Prediction

In [31]:
text = "I love my country"
# Apply the same cleaning that was used on the training tweets, so the tokens
# match the vocabulary the vectorizer was fitted on. transform() expects an
# iterable of documents, hence the one-element list.
user_input = [clean_text(text)]
vectorize_user_input = vectorizer.transform(user_input)  # Vectorize input text
prediction = model.predict(vectorize_user_input)  # Make prediction

# Inverse of label_mapping: -1 = negative, 0 = neutral, 1 = positive.
sentiment_names = {-1: "Negative", 0: "Neutral", 1: "Positive"}
print(sentiment_names[int(prediction[0])])

Positive


In [32]:
# NOTE(review): this import sits mid-notebook; the other imports are in the first cell.
import joblib
# Persist the trained classifier to disk. joblib files are pickle-based, so
# they should only ever be loaded from a trusted source.
joblib.dump(model,"twitter_sentiment_analysis_model.pkl")
print("Successfully Exported 'twitter_sentiment_analysis_model.pkl' ")

Successfully Exported 'twitter_sentiment_analysis_model.pkl' 


In [33]:
# Persist the fitted TF-IDF vectorizer as well: the model was trained on this
# vectorizer's output, so new text has to go through it before prediction.
joblib.dump(vectorizer,"vectorizer.pkl")
print("Successfully Exported 'vectorizer.pkl' ")

Successfully Exported 'vectorizer.pkl' 
