### Data Pre-Processing

In [39]:
pip install nltk



In [40]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

In [41]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [42]:
df = pd.read_csv('twitter_sentiment_data.csv')

In [43]:
df = df.drop('tweetid', axis=1)

In [44]:
df.head()

Unnamed: 0,sentiment,message
0,-1,@tiniebeany climate change is an interesting h...
1,1,RT @NatGeoChannel: Watch #BeforeTheFlood right...
2,1,Fabulous! Leonardo #DiCaprio's film on #climat...
3,1,RT @Mick_Fanning: Just watched this amazing do...
4,2,"RT @cnalive: Pranita Biswasi, a Lutheran from ..."


In [45]:
def clean_tweet(tweet):
    # Remove special characters, links, and hashtags
    tweet = re.sub(r'[^a-zA-Z0-9\s]', '', tweet)
    # Convert to lowercase
    tweet = tweet.lower()
    # Expand contractions (you may need a more extensive list)
    tweet = re.sub(r"can't", "cannot", tweet)
    tweet = re.sub(r"i've", "i have", tweet)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tweet = ' '.join(word for word in tweet.split() if word not in stop_words)
    # Tokenize the tweet
    tokens = tweet.split()
    # Remove mentions and usernames
    tokens = [word for word in tokens if not word.startswith('@')]
    # Join the tokens back to a single string
    tweet = ' '.join(tokens)
    return tweet

In [46]:
df['Cleaned Tweet'] = df['message'].apply(clean_tweet)
df.head()

Unnamed: 0,sentiment,message,Cleaned Tweet
0,-1,@tiniebeany climate change is an interesting h...,tiniebeany climate change interesting hustle g...
1,1,RT @NatGeoChannel: Watch #BeforeTheFlood right...,rt natgeochannel watch beforetheflood right le...
2,1,Fabulous! Leonardo #DiCaprio's film on #climat...,fabulous leonardo dicaprios film climate chang...
3,1,RT @Mick_Fanning: Just watched this amazing do...,rt mickfanning watched amazing documentary leo...
4,2,"RT @cnalive: Pranita Biswasi, a Lutheran from ...",rt cnalive pranita biswasi lutheran odisha giv...


In [None]:
df = df.drop_duplicates(subset='Cleaned Tweet')

### Train Test Split

In [48]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics

### Naive Bayes

In [None]:
df_nb = df
# Split the data into features (cleaned tweets) and labels (sentiments)
X = df_nb['Cleaned Tweet']
y = df_nb['sentiment']

# Convert the text data into numerical feature vectors using CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=59)

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Create and train the Naive Bayes classifier
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train, y_train)

# Predict the sentiments for the testing set
y_pred = naive_bayes.predict(X_test)

In [None]:
# Evaluate the classifier's performance
accuracy = metrics.accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred, average='macro')
recall = metrics.recall_score(y_test, y_pred, average='macro')
f1_score = metrics.f1_score(y_test, y_pred, average='macro')

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1_score)

### SVM

In [53]:
df_svm = df
# Split the data into features (cleaned tweets) and labels (sentiments)
X = df_svm['Cleaned Tweet']
y = df_svm['sentiment']

# Convert the text data into numerical feature vectors using CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=59)

In [54]:
from sklearn.svm import SVC

In [55]:
# Create and train the SVM classifier
svm = SVC()
svm.fit(X_train, y_train)

# Predict the sentiments for the testing set
y_pred = svm.predict(X_test)

In [None]:
# Evaluate the classifier's performance
accuracy = metrics.accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred, average='macro')
recall = metrics.recall_score(y_test, y_pred, average='macro')
f1_score = metrics.f1_score(y_test, y_pred, average='macro')

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1_score)

: 

### C4.5

In [57]:
df_c4_5 = df
# Split the data into features (cleaned tweets) and labels (sentiments)
X = df_c4_5['Cleaned Tweet']
y = df_c4_5['sentiment']

# Convert the text data into numerical feature vectors using CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=59)

In [58]:
from sklearn.tree import DecisionTreeClassifier

In [59]:
# Create and train the C4.5 decision tree classifier
c4_5 = DecisionTreeClassifier()
c4_5.fit(X_train, y_train)

# Predict the sentiments for the testing set
y_pred = c4_5.predict(X_test)

In [60]:
accuracy = metrics.accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred, average='macro')
recall = metrics.recall_score(y_test, y_pred, average='macro')
f1_score = metrics.f1_score(y_test, y_pred, average='macro')

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1_score)

Accuracy: 0.6152813377273282
Precision: 0.5642714450591857
Recall: 0.5275893513222524
F1-Score: 0.541441119559714


### Random Forest

In [61]:
df_rf = df
# Split the data into features (cleaned tweets) and labels (sentiments)
X = df_rf['Cleaned Tweet']
y = df_rf['sentiment']

# Convert the text data into numerical feature vectors using CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=59)

In [62]:
from sklearn.ensemble import RandomForestClassifier

In [63]:
# Create and train the Random Forest classifier
random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)

# Predict the sentiments for the testing set
y_pred = random_forest.predict(X_test)

In [64]:
# Evaluate the classifier's performance
accuracy = metrics.accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred, average='macro')
recall = metrics.recall_score(y_test, y_pred, average='macro')
f1_score = metrics.f1_score(y_test, y_pred, average='macro')

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1_score)

Accuracy: 0.6796045404613694
Precision: 0.6846616482866903
Recall: 0.5506408314700717
F1-Score: 0.5830926139754555


### XGBoost

In [68]:
df_xgb = df

# Map sentiment values to the expected classes
sentiment_mapping = {-1: 0, 0: 1, 1: 2, 2: 3}
df_xgb['sentiment'] = df_xgb['sentiment'].map(sentiment_mapping)
# Split the data into features (cleaned tweets) and labels (sentiments)
X = df_xgb['Cleaned Tweet']
y = df_xgb['sentiment']

# Convert the text data into numerical feature vectors using CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=59)

In [69]:
import xgboost as xgb

In [70]:
# Create and train the XGBoost classifier
xgb_classifier = xgb.XGBClassifier()
xgb_classifier.fit(X_train, y_train)

# Predict the sentiments for the testing set
y_pred = xgb_classifier.predict(X_test)

In [71]:
# Evaluate the classifier's performance
accuracy = metrics.accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred, average='macro')
recall = metrics.recall_score(y_test, y_pred, average='macro')
f1_score = metrics.f1_score(y_test, y_pred, average='macro')

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1_score)

Accuracy: 0.656536067374588
Precision: 0.6793415125377642
Recall: 0.5217626604776062
F1-Score: 0.5626785601705536
