## Importing Libraries

In [30]:
import re
import nltk
import pandas as pd
import random
import matplotlib.pyplot as plt
import nltk
import numpy as np
import seaborn as sns
import joblib


from nltk.corpus import opinion_lexicon
from nltk.tokenize import word_tokenize
from google.colab import drive
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
from collections import Counter
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report

In [15]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
cd drive/MyDrive/sentiment_data/trainingandtestdata

[Errno 2] No such file or directory: 'drive/MyDrive/sentiment_data/trainingandtestdata'
/content/drive/MyDrive/sentiment_data/trainingandtestdata


In [17]:
ls

[0m[01;34mglove_embeddings[0m/                               testdata.manual.2009.06.14.csv
logistic_regression_model_with_cleaning.pkl     tfidf_vectorizer.pkl
logistic_regression_model_without_cleaning.pkl  tfidf_vectorizer_with_cleaning.pkl
lr_model.pkl                                    tfidf_vectorizer_without_cleaning.pkl
svc_model.pkl                                   training.1600000.processed.noemoticon.csv


In [18]:
# Load the training and test datasets
train_file_path = 'training.1600000.processed.noemoticon.csv'
test_file_path = 'testdata.manual.2009.06.14.csv'

# The training dataset does not have a header
train_columns = ['target', 'ids', 'date', 'flag', 'user', 'text']
train_data = pd.read_csv(train_file_path, names=train_columns, encoding='latin1')

# The test dataset does not have a header
test_columns = ['target', 'ids', 'date', 'flag', 'user', 'text']
test_data = pd.read_csv(test_file_path, names=test_columns, encoding='latin1')

# Display the first few rows of each dataset
print("Training Data:")
print(train_data.head())

print("\nTest Data:")
print(test_data.head())


Training Data:
   target         ids                          date      flag  \
0       0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY   
1       0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   
2       0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY   
3       0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
4       0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   

              user                                               text  
0  _TheSpecialOne_  @switchfoot http://twitpic.com/2y1zl - Awww, t...  
1    scotthamilton  is upset that he can't update his Facebook by ...  
2         mattycus  @Kenichan I dived many times for the ball. Man...  
3          ElleCTF    my whole body feels itchy and like its on fire   
4           Karoli  @nationwideclass no, it's not behaving at all....  

Test Data:
   target  ids                          date     flag      user  \
0       4    3  Mon May 11 03:17:40 UTC 2009  kindle2    tpryan   
1       4    4  

### Cleaning the text with links and stop words

In [19]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def enhanced_clean_text(text):
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Remove non-alphabetic characters
    text = text.lower()  # Lowercase
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return " ".join(words)

train_data['cleaned_text'] = train_data['text'].apply(enhanced_clean_text)
test_data['cleaned_text'] = test_data['text'].apply(enhanced_clean_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


### Downloading Glove Weights

In [None]:
# Download GloVe embeddings
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip -d glove.6B/

# Load GloVe embeddings
import numpy as np

def load_glove_embeddings(glove_file):
    embeddings = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

glove_file = 'glove.6B/glove.6B.50d.txt'
embeddings_index = load_glove_embeddings(glove_file)


### Selecting Glove 50d weights

In [26]:
glove_file = 'glove_embeddings/glove.6B.50d.txt'
embeddings_index = load_glove_embeddings(glove_file)

In [None]:
def load_glove_embeddings(glove_file):
    embeddings = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings


def get_glove_embeddings(text, embeddings_index, dim=50):
    words = text.split()
    word_embeddings = [embeddings_index.get(word, np.zeros(dim)) for word in words]
    if len(word_embeddings) == 0:
        return np.zeros(dim)
    else:
        return np.mean(word_embeddings, axis=0)

X_train_glove = np.array([get_glove_embeddings(text, embeddings_index) for text in train_data['cleaned_text']])
X_test_glove = np.array([get_glove_embeddings(text, embeddings_index) for text in test_data['cleaned_text']])

# Split the data into training and testing sets
processed_text = train_data['cleaned_text']
sentiment = train_data['target']


### Training Model

In [35]:
from sklearn.utils.class_weight import compute_class_weight

X_train, X_test_train, y_train, y_test_train = train_test_split(processed_text, sentiment, test_size=0.05, random_state=0)

X_train_glove = np.array([get_glove_embeddings(text, embeddings_index) for text in X_train])
X_test_train_glove = np.array([get_glove_embeddings(text, embeddings_index) for text in X_test_train])
X_test_glove = np.array([get_glove_embeddings(text, embeddings_index) for text in test_data['cleaned_text']])


# Calculate class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = {np.unique(y_train)[i]: class_weights[i] for i in range(len(class_weights))}
print(f"Class weights: {class_weight_dict}")

# Train the model with class weights
SVCmodel = LinearSVC(class_weight=class_weight_dict)
SVCmodel.fit(X_train_glove, y_train)
print(f'Model trained.')

# Save the model and vectorizer
joblib.dump(SVCmodel, 'svc_model.pkl')
print(f'Model saved.')

Class weights: {0: 0.999985526525274, 4: 1.0000144738937011}




Model trained.
Model saved.


### Running Evaluation

In [40]:
def model_Evaluate(model, X_test, y_test):
    # Predict values for the test dataset
    y_pred = model.predict(X_test)

    # Print the evaluation metrics for the dataset
    print(classification_report(y_test, y_pred))


# Evaluate the model on the internal test set
print("Evaluation on internal test set:")
model_Evaluate(SVCmodel, X_test_train_glove, y_test_train)

# Evaluate the model on the external test data
print("Evaluation on external test data:")
model_Evaluate(SVCmodel, X_test_glove, test_data['target'])

# Function to predict sentiment of a single sentence
def predict_sentiment(sentence):
    # Preprocess the sentence
    cleaned_sentence = enhanced_clean_text(sentence)
    # Get GloVe embeddings for the sentence
    glove_embedding = get_glove_embeddings(cleaned_sentence, embeddings_index)
    # Predict the sentiment
    prediction = SVCmodel.predict([glove_embedding])
    return prediction[0]

# Example usage
example_sentence = "I love sunny days but I hate the rain."
predicted_sentiment = predict_sentiment(example_sentence)
print(f"Predicted sentiment for '{example_sentence}': {predicted_sentiment}")



Evaluation on internal test set:
              precision    recall  f1-score   support

           0       0.67      0.66      0.67     39989
           4       0.67      0.67      0.67     40011

    accuracy                           0.67     80000
   macro avg       0.67      0.67      0.67     80000
weighted avg       0.67      0.67      0.67     80000

Evaluation on external test data:
              precision    recall  f1-score   support

           0       0.64      0.71      0.67       177
           2       0.00      0.00      0.00       139
           4       0.50      0.84      0.63       182

    accuracy                           0.56       498
   macro avg       0.38      0.51      0.43       498
weighted avg       0.41      0.56      0.47       498

Predicted sentiment for 'I love sunny days but I hate the rain.': 4


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
