# Twitter Sentiment Analysis

In [39]:
# Importing Libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
import requests
import warnings
warnings.filterwarnings('ignore')

## Data Collection

In [3]:
# loading the data from csv file to pandas dataframe
df1 = pd.read_csv('data.csv', encoding ='latin-1', header = None)
df1.columns = ['target','id','date','flag','user','text']


## Data Preprocessing

In [4]:
# check the dataset shape
df1.shape

(1600000, 6)

In [5]:
# print the first 5 rows of the dataframe
df1.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [6]:
# checking the number of missing values in the dataset
df1.isna().sum()


target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [7]:
# checking the distribution of target column
df1['target'].value_counts()
# the distribution is even, no need to upscaling or downscaling

0    800000
4    800000
Name: target, dtype: int64

In [8]:
# convert the target '4' to '1'
df1.replace({'target':{4:1}}, inplace = True)

In [9]:
# check the distribution of target column with updated value
df1['target'].value_counts()

0    800000
1    800000
Name: target, dtype: int64

In [10]:
# 0 -- negative tweet
# 1 -- positive tweet

### Text Cleaning

In [11]:
def clean_up(s):
    # Remove URLs
    s = re.sub(r'http\S+', '', s)
    
    # Replace numbers and special characters with a whitespace
    s = re.sub(r'[^A-Za-z\s]', ' ', s)
    
    # Remove extra whitespaces
    s = re.sub(r'\s+', ' ', s).strip()
    
    # Convert to lowercase
    s = s.lower()
    return s

### Tokenization

In [12]:
#  filters out any tokens that are not alphanumeric
def tokenize(s):
    tokens = word_tokenize(s)
    return [token for token in tokens if token.isalnum()]

### Stemming and Lemmatization

In [13]:
def stem_and_lemmatize(l):
    ps = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(word) for word in l]
    return [ps.stem(word) for word in lemmatized]


### Stop Words Removal

In [14]:
def remove_stopwords(l):
    stop_words = set(stopwords.words('english'))
    return [word for word in l if not word in stop_words]


In [15]:
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/amy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/amy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/amy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [16]:
# Build preprocessing pipeline
def preprocess_text(text):
    cleaned_text = clean_up(text)
    tokenized_text = tokenize(cleaned_text)
    stemmed_and_lemmatized_text = stem_and_lemmatize(tokenized_text)
    text_without_stopwords = remove_stopwords(stemmed_and_lemmatized_text)
    return ' '.join(text_without_stopwords)

In [17]:
# Apply the preprocessing pipeline
df1['processed_content'] = df1['text'].apply(preprocess_text)


In [18]:
# check out the new df
df1.head()

Unnamed: 0,target,id,date,flag,user,text,processed_content
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot awww bummer shoulda got david carr ...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset updat hi facebook text might cri result ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dive mani time ball manag save rest g...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behav mad whi becaus see


In [19]:
# save the results into a new csv file
df1.to_csv('cleaned_df.csv', index=False)

## Modeling

In [20]:
# Select a 10% sample of the dataset for faster computation
df_sample = df1.sample(frac=0.1, random_state=42)
df_sample.shape

(160000, 7)

In [22]:
# Spliting X and y
X = df_sample['processed_content']
y = df_sample['target']

In [23]:
# Splitting the data to training data and test data
X_train,X_test, y_train,y_test = train_test_split(X,y, test_size = 0.2, stratify = y, random_state = 42)

In [24]:
# converting the texual data to numerical data
vectorizer = TfidfVectorizer()

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test) 

## Model Evaluation

In [25]:
# Define and Train Models
def train_evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)
    return accuracy_score(y_train, train_pred), accuracy_score(y_test, test_pred)

In [29]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Naive Bayes": MultinomialNB(),
    "SVM": SVC()
}

In [30]:
results = {}
for model_name, model in models.items():
    train_acc, test_acc = train_evaluate_model(model, X_train, y_train, X_test, y_test)
    results[model_name] = {"Train Accuracy": train_acc, "Test Accuracy": test_acc}


In [31]:
results

{'Logistic Regression': {'Train Accuracy': 0.821578125,
  'Test Accuracy': 0.767375},
 'Naive Bayes': {'Train Accuracy': 0.8511171875, 'Test Accuracy': 0.74784375},
 'SVM': {'Train Accuracy': 0.9475546875, 'Test Accuracy': 0.7728125}}

## Choose the model for further tunning

Overall Analysis:

All three models show signs of overfitting, as evidenced by higher accuracies on the training data compared to the test data. 

The SVM, despite having the highest training accuracy, does not significantly outperform the other models on the test set. This suggests that while it learned the training data well, it may not generalize as effectively to new, unseen data.

Logistic Regression and Naive Bayes have more comparable performance, with Logistic Regression slightly outperforming Naive Bayes on the test set.

Conclusion:

Balanced Performance: Logistic Regression demonstrates a good balance between training and test accuracies (82.16% and 76.74% respectively). While it doesn't have the highest training accuracy, its test accuracy is competitive, indicating a reasonable generalization to unseen data.

Efficiency in Training: Unlike SVM, which exhibits prolonged training times making it less practical for large datasets, Logistic Regression offers a more time-efficient alternative. This efficiency is crucial, especially when working with large datasets or in scenarios where model retraining is frequent.

## Hyperparameter Tuning for Best Model


In [32]:
# Define a dictionary of hyperparameters to test.
# 'C' represents the inverse of regularization strength. Smaller values specify stronger regularization.
# 'solver' is the algorithm used for optimization. Different solvers can perform differently depending on the data.
param_grid = {
    'C': [100, 10, 1.0, 0.1, 0.01],  # Regularization values
    'solver': ['newton-cg', 'lbfgs', 'liblinear']  # Optimization algorithms
}

# Create a GridSearchCV object.
# LogisticRegression() is the model to tune.
# param_grid is the range of parameters to test.
# cv=5 specifies 5-fold cross-validation.
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5)

# Fit the GridSearchCV object to the training data.
# This will test all combinations of parameters in param_grid
# using 5-fold cross-validation and identify the best combination.
grid.fit(X_train, y_train)


In [34]:
# Evaluation
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("Best Cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)
print("Test accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))

Confusion Matrix:
[[11918  4044]
 [ 3399 12639]]

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.75      0.76     15962
           1       0.76      0.79      0.77     16038

    accuracy                           0.77     32000
   macro avg       0.77      0.77      0.77     32000
weighted avg       0.77      0.77      0.77     32000

Best Cross-validation score: 0.76
Best parameters:  {'C': 1.0, 'solver': 'lbfgs'}
Test accuracy: 76.74%


## Save the best model

In [36]:
import pickle

In [37]:
# Save the model to a file
with open('best_logistic_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)

## Scrape data from Twitter

In [41]:
def load_api_key(file_path):
    # This function reads the API key from a file.
    try:
        with open(file_path, 'r') as file:
            return file.read().strip()
    except FileNotFoundError:
        # If the file is not found, an error message is printed.
        print("API key file not found.")
        return None

# Specify the path to your API key file
api_key_file = 'api_key.txt'
# Load the API key
api_key = load_api_key(api_key_file)

In [121]:
# Check if the API key is available
if not api_key:
    print("API key is required to proceed.")
else:
    # Initialize an empty list to store the collected tweets and a set for tweet IDs to avoid duplicates
    twitter_data = []
    tweet_ids = set() 

    # Set the number of tweets to fetch per request and the total number of tweets needed
    num_tweets_per_request = 100
   
    # Prepare the payload for the API request with the API key, query term, number of tweets, and starting point
    payload = {
        'api_key': api_key,
        'query': 'coursera',
        'num': str(num_tweets_per_request),
        'start': '0'  # 'start' is set to '0' and is not updated in subsequent iterations
    }

    # Make the API request
    response = requests.get('https://api.scraperapi.com/structured/twitter/search', params=payload)

    # Check if the response from the API is successful
    if response.status_code == 200:
        data = response.json()
        all_tweets = data.get('organic_results', [])  # Extract tweets from the response

        # Filter out tweets that have already been collected, based on their IDs
        new_tweets = [tweet for tweet in all_tweets if tweet.get('id') not in tweet_ids]
        twitter_data.extend(new_tweets)  # Add new tweets to the collection

        # Update the set of seen tweet IDs to include the IDs of the newly fetched tweets
        tweet_ids.update(tweet.get('id') for tweet in new_tweets)
    else:
        # If the request fails, print an error message with the status code
        print("Failed to fetch data. Status code:", response.status_code)


In [126]:
df2 = pd.DataFrame(twitter_data)
# Save the DataFrame to a CSV file.
df2.to_csv('tweets_scraped.csv', index=False)

In [123]:
# check the shape of scrapped data
df2.shape

(100, 6)

In [124]:
# check whether the data was duplicated
df2['snippet'].duplicated().sum()

0

In [128]:
df2.head()

Unnamed: 0,position,title,snippet,highlighs,link,displayed_link
0,0,Coursera,LearnWithoutLimits on Coursera. Access online ...,[Coursera],https://twitter.com/coursera,https://twitter.com › coursera
1,1,Coursera,"For a limited time, we're offering the first m...",[Coursera],https://mobile.twitter.com/coursera/status/172...,https://mobile.twitter.com › coursera › status
2,2,Coursera,Coursera · @coursera. Unlock new opportunities...,"[Coursera, coursera]",https://mobile.twitter.com/coursera/status/170...,https://mobile.twitter.com › coursera › status
3,3,Coursera,Enrolling in a Coursera course or program can ...,[Coursera],https://twitter.com/coursera/status/1692554542...,https://twitter.com › coursera › status
4,4,Coursera,Looking to take the next step in your career? ...,[Coursera],https://twitter.com/coursera/status/1666806122...,https://twitter.com › coursera › status


## Clean and process scraped data

In [129]:
df2.isna().sum()

position          0
title             0
snippet           1
highlighs         1
link              0
displayed_link    0
dtype: int64

In [130]:
# since the number of nan is not big, drop na
df2.dropna(inplace=True)

In [131]:
# check the data shape after dropping
df2.shape

(99, 6)

In [132]:
# Apply the preprocessing pipeline to df2
df2['processed_content'] = df2['snippet'].apply(preprocess_text)

In [133]:
df2.head()

Unnamed: 0,position,title,snippet,highlighs,link,displayed_link,processed_content
0,0,Coursera,LearnWithoutLimits on Coursera. Access online ...,[Coursera],https://twitter.com/coursera,https://twitter.com › coursera,learnwithoutlimit coursera access onlin cours ...
1,1,Coursera,"For a limited time, we're offering the first m...",[Coursera],https://mobile.twitter.com/coursera/status/172...,https://mobile.twitter.com › coursera › status,limit time offer first month coursera plu acce...
2,2,Coursera,Coursera · @coursera. Unlock new opportunities...,"[Coursera, coursera]",https://mobile.twitter.com/coursera/status/170...,https://mobile.twitter.com › coursera › status,coursera coursera unlock new opportun learn co...
3,3,Coursera,Enrolling in a Coursera course or program can ...,[Coursera],https://twitter.com/coursera/status/1692554542...,https://twitter.com › coursera › status,enrol coursera cours program help unemploy cit...
4,4,Coursera,Looking to take the next step in your career? ...,[Coursera],https://twitter.com/coursera/status/1666806122...,https://twitter.com › coursera › status,look take next step career enjoy first month c...


## Predict sentiment for scraped data

In [134]:
# Load the model from the file
with open('best_logistic_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

In [135]:
# Extract the text data for prediction
X_new = df2['processed_content']

In [136]:
# Transform the new text data to the same feature space as the model was trained on
X_new_vect = vectorizer.transform(X_new) 

In [137]:
# Use the loaded model to make predictions on the transformed data
prediction_new = loaded_model.predict(X_new_vect)

In [138]:
# Add target to original dataframe
df2['target'] = prediction_new

In [139]:
df2.head()

Unnamed: 0,position,title,snippet,highlighs,link,displayed_link,processed_content,target
0,0,Coursera,LearnWithoutLimits on Coursera. Access online ...,[Coursera],https://twitter.com/coursera,https://twitter.com › coursera,learnwithoutlimit coursera access onlin cours ...,1
1,1,Coursera,"For a limited time, we're offering the first m...",[Coursera],https://mobile.twitter.com/coursera/status/172...,https://mobile.twitter.com › coursera › status,limit time offer first month coursera plu acce...,1
2,2,Coursera,Coursera · @coursera. Unlock new opportunities...,"[Coursera, coursera]",https://mobile.twitter.com/coursera/status/170...,https://mobile.twitter.com › coursera › status,coursera coursera unlock new opportun learn co...,1
3,3,Coursera,Enrolling in a Coursera course or program can ...,[Coursera],https://twitter.com/coursera/status/1692554542...,https://twitter.com › coursera › status,enrol coursera cours program help unemploy cit...,1
4,4,Coursera,Looking to take the next step in your career? ...,[Coursera],https://twitter.com/coursera/status/1666806122...,https://twitter.com › coursera › status,look take next step career enjoy first month c...,1


In [141]:
df2['target'].value_counts()

1    86
0    13
Name: target, dtype: int64

In [140]:
# Save the new DataFrame to a CSV file.
df2.to_csv('tweets_scraped_labeled.csv', index=False)