### Importing all the required libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import spacy
from geotext import GeoText

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

#nltk.download("stopwords")
#nltk.download("punkt")
#nltk.download("wordnet")

### Importing the data from csv file

In [2]:
# Raw tweet export; the file is expected next to the notebook.
TWEETS_CSV = "tweets_v8.csv"
data = pd.read_csv(TWEETS_CSV)

In [3]:
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,source,is_retweet
0,the _ûndër-ratèd niggáh👊🏾,,@ManUtd die hard❤️❤️💪🏿💪🏿\n\n\nYOLO\n\n\nJ'ai b...,2019-09-06 19:24:57+00:00,581,1035,8922,False,2021-10-06 12:05:38+00:00,When life hits and the same time poverty strik...,Twitter for Android,False
1,Best uncle on planet earth,,,2013-05-08 19:35:26+00:00,741,730,8432,False,2021-10-06 12:05:22+00:00,That marble episode of #SquidGame ruined me. 😭😭😭,Twitter for Android,False
2,marcie,,animal crossing. chicken nuggets. baby yoda. s...,2009-02-21 10:31:30+00:00,562,1197,62732,False,2021-10-06 12:05:22+00:00,#Squidgame time,Twitter Web App,False
3,YoMo.Mdp,Any pronouns,Where the heck is the karma\nI'm going on my s...,2021-02-14 13:21:22+00:00,3,277,1341,False,2021-10-06 12:05:04+00:00,//Blood on 1st slide\nI'm joining the squidgam...,Twitter Web App,False
4,Laura Reactions,France,I talk and I make reactions videos about shows...,2018-12-19 20:38:28+00:00,330,152,2278,False,2021-10-06 12:05:00+00:00,"The two first games, players were killed by th...",Twitter Web App,False


### Removing all the irrelevant columns and the rows with a missing user location

In [4]:
# Count the tweets whose author left the location field empty.
location_column = data["user_location"]
num_missing = location_column.isnull().sum()

In [5]:
# Keep only the rows that carry a user location; `data` itself is left untouched.
data_cleaned = data.dropna(axis="index", how="any", subset=["user_location"])

In [6]:
# (rows, columns) remaining after dropping rows without a user location.
data_cleaned.shape

(56149, 12)

In [7]:
# Profile and tweet metadata that the later cells never read.
unused_columns = [
    'user_description', 'user_created', 'user_name', 'user_followers',
    'user_friends', 'user_favourites', 'user_verified', 'date',
    'source', 'is_retweet',
]
data_cleaned = data_cleaned.drop(columns=unused_columns)

### Creating a function to validate a location using NER, Regex and GeoText

In [8]:
# spaCy pipeline whose named-entity recognizer is used by is_valid_location.
SPACY_MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL_NAME)

def is_valid_location(text):
    """Decide whether a free-text profile location looks like a real place.

    The checks run from cheapest to most expensive:

    1. Reject anything that is not a string (None, NaN, numbers) or that is
       shorter than two characters.
    2. Reject strings with fewer than two ASCII letters (emoji, digits,
       punctuation-only values).
    3. Accept if GeoText finds a city or a country in the string.
    4. Otherwise accept if the spaCy pipeline `nlp` tags any entity as
       GPE, LOC or FAC.

    Args:
        text: The raw `user_location` value.

    Returns:
        bool: True when the value passes one of the place checks, else False.
    """
    # Checking the type first keeps len() from raising on non-null,
    # non-string values such as integers.
    if not isinstance(text, str) or len(text) < 2:
        return False

    if len(re.findall(r'[a-zA-Z]', text)) < 2:
        return False

    # Gazetteer lookup first: it is much cheaper than running the NER model.
    places = GeoText(text)
    if places.cities or places.countries:
        return True

    return any(ent.label_ in ("GPE", "LOC", "FAC") for ent in nlp(text).ents)

### Creating a function to clean text and applying it to the tweet text from the dataset

In [9]:
# Filled on first use so the stop-word corpus is read once, not once per tweet.
_CLEAN_TEXT_RESOURCES = {}


def _clean_text_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it on first use."""
    if not _CLEAN_TEXT_RESOURCES:
        _CLEAN_TEXT_RESOURCES["stop_words"] = set(stopwords.words("english"))
        _CLEAN_TEXT_RESOURCES["lemmatizer"] = WordNetLemmatizer()
    return _CLEAN_TEXT_RESOURCES["stop_words"], _CLEAN_TEXT_RESOURCES["lemmatizer"]


def clean_text(text):
    """Normalize a tweet into a space-separated string of lemmatized tokens.

    Steps: lowercase; strip URLs, @mentions and #hashtags (the whole tag is
    removed, not just the '#'); drop every character that is not an ASCII
    letter or whitespace; collapse whitespace; tokenize; remove English stop
    words; lemmatize each remaining token.

    Args:
        text: The raw tweet text. Any non-string value (None, NaN, numbers)
            is treated as missing.

    Returns:
        str: The cleaned text, or "" when `text` is not a string.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"#\w+", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()

    stop_words, lemmatizer = _clean_text_resources()
    tokens = [token for token in word_tokenize(text) if token not in stop_words]
    return " ".join(lemmatizer.lemmatize(token) for token in tokens)


In [10]:
# Flag each row, then keep only the rows whose location passed validation.
location_ok = data_cleaned["user_location"].apply(is_valid_location)
data_cleaned = (
    data_cleaned
    .assign(is_valid_location=location_ok)
    .loc[location_ok]
    .reset_index(drop=True)
)

In [11]:
# Normalize every tweet; non-string values come back as "".
data_cleaned["cleaned_text"] = data_cleaned["text"].map(clean_text)

### Creating TF-IDF vectors for the cleaned text

In [12]:
# NOTE(review): this matrix is expressed in the vocabulary learned from the
# tweets themselves; its columns only line up with a model trained on a
# vectorizer fitted on this same corpus.
tweet_corpus = data_cleaned['cleaned_text']
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_vectors = tfidf_vectorizer.fit(tweet_corpus).transform(tweet_corpus)

print("TF-IDF shape:", tfidf_vectors.shape)

TF-IDF shape: (39275, 5000)


### Importing the training data and processing it to feed the model

In [13]:
# Labelled tweets; rows with missing cleaned text become "" so the
# vectorizer receives strings only.
train_data = pd.read_csv("tweet_sentiment.csv").fillna({"cleaned_text": ""})
y = train_data['label']

# Refitting replaces the vocabulary that `tfidf_vectorizer` held before this cell.
tfidf_vectors_train = tfidf_vectorizer.fit_transform(train_data['cleaned_text'])
print("TF-IDF shape:", tfidf_vectors_train.shape)

# Dense copy of the training features for the classifier.
X_tfidf_train = tfidf_vectors_train.toarray()

TF-IDF shape: (4869, 5000)


### Building the SVM model and fitting it with the training data

In [14]:
# RBF-kernel support vector classifier trained on the dense TF-IDF features.
svm_params = {"C": 10, "kernel": 'rbf', "gamma": 'scale'}
svm_model = SVC(**svm_params)
svm_model.fit(X_tfidf_train, y)


### Using the model to predict the sentiment of cleaned text

In [15]:
# Vectorize the tweets with the vectorizer as it was fitted on the training
# corpus. `tfidf_vectors` was built from a vocabulary fitted on the tweets,
# so its columns refer to different words than the features the model
# learned, and predictions made from it are meaningless.
# transform() (not fit_transform()) keeps the training vocabulary and IDF weights.
review_tweet = tfidf_vectorizer.transform(data_cleaned['cleaned_text']).toarray()
review_sentiment = svm_model.predict(review_tweet)

data_cleaned['sentiment'] = review_sentiment

In [16]:
# Human-readable names for the numeric classes; codes missing from the map become NaN.
sentiment_map = dict(zip((-1, 0, 1), ('Negative', 'Neutral', 'Positive')))
sentiment_names = data_cleaned['sentiment'].map(sentiment_map)
data_cleaned['sentiment_label'] = sentiment_names

In [17]:
# Write the labelled tweets to a workbook, leaving out the positional index.
data_cleaned.to_excel(excel_writer="show_review.xlsx", index=False)