In [None]:
# Install required libraries
!pip install gdown
!pip install beautifulsoup4
!pip install textblob
!pip install scikit-learn



In [1]:
# Import necessary libraries
import gdown
import re
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pickle

In [2]:
# Download the dataset
file_url = 'https://drive.google.com/uc?id=145OF30g7TfEATc-fUAIQotyuGahAZ6CF'
output_file = 'twittersentiment.csv'
gdown.download(file_url, output_file, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=145OF30g7TfEATc-fUAIQotyuGahAZ6CF
To: /content/twittersentiment.csv
100%|██████████| 10.5M/10.5M [00:00<00:00, 43.5MB/s]


'twittersentiment.csv'

In [3]:
# Load spaCy model
import en_core_web_sm
nlp = en_core_web_sm.load()

In [4]:
# Necessary Functions

# Function to remove emails from text
def remove_emails(x):
    return re.sub(r'([a-z0-9+._-]+@[a-z0-9+._-]+\.[a-z0-9+_-]+)', "", x)

# Function to remove URLs from text
def remove_urls(x):
    return re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', x)

# Function to remove HTML tags from text
def remove_html_tags(x):
    return BeautifulSoup(x, 'lxml').get_text().strip()

# Function to remove special characters from text
def remove_special_chars(x):
    x = re.sub(r'[^\w ]+', "", x)
    x = ' '.join(x.split())
    return x

# Function to remove 'RT' (retweet) from text
def remove_rt(x):
    return re.sub(r'\brt\b', '', x).strip()


In [5]:
# Load the dataset
df = pd.read_csv('twittersentiment.csv', header=None, index_col=[0])
df = df[[2, 3]].reset_index(drop=True)
df.columns = ['sentiment', 'text']

In [6]:
df.head()

Unnamed: 0,sentiment,text
0,Positive,im getting on borderlands and i will murder yo...
1,Positive,I am coming to the borders and I will kill you...
2,Positive,im getting on borderlands and i will kill you ...
3,Positive,im coming on borderlands and i will murder you...
4,Positive,im getting on borderlands 2 and i will murder ...


In [7]:
# Drop NaN values and texts with length less than 1
df.dropna(inplace=True)
df = df[df['text'].apply(len) > 1]

In [8]:
# Define stopwords
stopwords = set(nlp.Defaults.stop_words)


# Function to get basic text features
def get_basic_features(df, stopwords):
    df['char_counts'] = df['text'].apply(lambda x: len(x))
    df['word_counts'] = df['text'].apply(lambda x: len(x.split()))
    df['avg_wordlength'] = df.apply(lambda x: len(x['text'].replace(" ", "")) / len(x['text'].split()) if len(x['text'].split()) != 0 else 0, axis=1)
    df['stopwords_counts'] = df['text'].apply(lambda x: len([t for t in x.split() if t in stopwords]))
    df['hashtag_counts'] = df['text'].apply(lambda x: len([t for t in x.split() if t.startswith('#')]))
    df['mentions_counts'] = df['text'].apply(lambda x: len([t for t in x.split() if t.startswith('@')]))
    df['digits_counts'] = df['text'].apply(lambda x: len(re.findall(r'[0-9,.]+', x)))
    df['uppercase_counts'] = df['text'].apply(lambda x: len([t for t in x.split() if t.isupper()]))
    return df


# Data cleaning
df['text'] = df['text'].apply(lambda x: x.lower())
df['text'] = df['text'].apply(remove_urls)
df['text'] = df['text'].apply(remove_html_tags)
df['text'] = df['text'].apply(remove_special_chars)
df['text'] = df['text'].apply(remove_rt)

# Get basic features
df = get_basic_features(df, stopwords)


  return BeautifulSoup(x, 'lxml').get_text().strip()


In [9]:
df.head()

Unnamed: 0,sentiment,text,char_counts,word_counts,avg_wordlength,stopwords_counts,hashtag_counts,mentions_counts,digits_counts,uppercase_counts
0,Positive,im getting on borderlands and i will murder yo...,51,10,4.2,6,0,0,0,0
1,Positive,i am coming to the borders and i will kill you...,50,12,3.25,9,0,0,0,0
2,Positive,im getting on borderlands and i will kill you all,49,10,4.0,6,0,0,0,0
3,Positive,im coming on borderlands and i will murder you...,50,10,4.1,6,0,0,0,0
4,Positive,im getting on borderlands 2 and i will murder ...,56,12,3.75,7,0,0,1,0


In [10]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['sentiment'], test_size=0.2, random_state=42)

In [11]:
# Model building
tfidf_vectorizer = TfidfVectorizer(stop_words=list(stopwords))
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
clf.fit(X_train_tfidf, y_train)




In [12]:
# Evaluation
predictions = clf.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, predictions))



Accuracy: 0.9058271935699933


In [13]:
# Save model
with open('twitter_sentiment.pkl', 'wb') as f:
    pickle.dump(clf, f)