In [3]:
import pandas as pd

# Load the CSV file with ISO-8859-1 encoding
df = pd.read_csv('balanced_train.csv', encoding='ISO-8859-1')

df.head()


Unnamed: 0,Sentence,Label
0,Didn`t work for me Except when I used the wo...,Negative
1,"in santa clara a long way from hoe, well not t...",Neutral
2,THANK YOU,Positive
3,mine pools... just in someone elses pocket,Neutral
4,OMGoodness back to school soon FUN!,Positive


In [4]:
# Show counts of each unique value in the 'Label' column
label_counts = df['Label'].value_counts()
print("\nCounts of each Label value:")
print(label_counts)


Counts of each Label value:
Label
Positive    8582
Negative    7781
Neutral     7781
Name: count, dtype: int64


In [5]:
import pandas as pd

# Load the balanced_train CSV file
df_balanced = pd.read_csv('balanced_train.csv', encoding='ISO-8859-1')

# Load the combined_sentences_labeled CSV file
df_combined = pd.read_csv('combined_sentences_labeled.csv', encoding='ISO-8859-1')

# Filter out Neutral and Negative sentences from combined_sentences_labeled
df_combined_filtered = df_combined[df_combined['Label'].isin(['Neutral', 'Negative'])]

# Append the filtered Neutral and Negative sentences to the balanced_train dataset
df_updated = pd.concat([df_balanced, df_combined_filtered[['Sentence', 'Label']]], ignore_index=True)

# Save the updated DataFrame to a new CSV file
df_updated.to_csv('balanced_train_updated.csv', index=False, encoding='ISO-8859-1')

# Display the counts of each Label value to verify
label_counts_updated = df_updated['Label'].value_counts()
print("\nUpdated counts of each Label value:")
print(label_counts_updated)



Updated counts of each Label value:
Label
Negative    9754
Neutral     9426
Positive    8582
Name: count, dtype: int64


# LETS START!!!!

In [6]:
import pandas as pd

# Load the CSV file with ISO-8859-1 encoding
df = pd.read_csv('balanced_train_updated.csv', encoding='ISO-8859-1')

df.head()


Unnamed: 0,Sentence,Label
0,Didn`t work for me Except when I used the wo...,Negative
1,"in santa clara a long way from hoe, well not t...",Neutral
2,THANK YOU,Positive
3,mine pools... just in someone elses pocket,Neutral
4,OMGoodness back to school soon FUN!,Positive


In [7]:
# Display the counts of each Label value to verify
label_counts_updated = df_updated['Label'].value_counts()
print("\nUpdated counts of each Label value:")
print(label_counts_updated)


Updated counts of each Label value:
Label
Negative    9754
Neutral     9426
Positive    8582
Name: count, dtype: int64


In [8]:
# Check for NaN values in the DataFrame
nan_summary = df.isna().sum()

# Display the count of NaN values for each column
print("NaN values in each column:\n", nan_summary)

NaN values in each column:
 Sentence    1
Label       0
dtype: int64


In [9]:
# Remove rows with NaN values in the 'text' or 'sentiment' columns
df_subset_cleaned = df.dropna(subset=['Sentence', 'Label',])

# Verify that NaN values have been removed
nan_summary_after = df_subset_cleaned.isna().sum()
print("\nNaN values in each column after cleaning:\n", nan_summary_after)

df=df_subset_cleaned


NaN values in each column after cleaning:
 Sentence    0
Label       0
dtype: int64


In [10]:
# Check the unique values and their counts in the 'sentiment' column after cleaning
sentiment_counts_cleaned = df_subset_cleaned['Label'].value_counts()


# Display the counts of each sentiment
print("Sentiment counts after removing NaN values:\n", sentiment_counts_cleaned)


Sentiment counts after removing NaN values:
 Label
Negative    9754
Neutral     9425
Positive    8582
Name: count, dtype: int64


In [11]:
df.sample(10)

Unnamed: 0,Sentence,Label
2384,May the 4th be with you #starwarsday (via ),Neutral
24614,She finished her report.,Neutral
3818,ahaha i know. but now i can`t do anything ove...,Negative
17838,Gonna nap n chill then probably go to the movi...,Negative
1329,"http://twitpic.com/4i2zu - wow, thats so cool!",Positive
25333,"She analyzed the proposal, recognizing both st...",Neutral
25748,"You explored the system, finding both function...",Neutral
1086,I wanna go to the extra show really bad,Negative
11894,Just rang the irish one. Drunk. Must confiscat...,Negative
1189,The party has to be moved to next weekend .....,Negative


# TRANSFORMATIONS APPLIED TO TEXT/SENTENCE:-

In [12]:
import pandas as pd
import re
import contractions
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk

# Download NLTK resources if not already installed
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Define a set of stopwords
stopwords = set(["a", "an", "the", "is", "in", "of", "to", "and", "for", "with", "on", "at", "by", "it", "this", "that", "which", "who", "whom", "has", "have", "had", "will", "would", "can", "could", "should", "may", "might", "there", "where", "how"])

# Function to get wordnet POS tags for lemmatization
def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

# Function to handle elongated words (e.g., reallllllly -> really)
def reduce_elongation(word):
    return re.sub(r'(.)\1{2,}', r'\1\1', word)  # Keeps two of the repeated letters

# Text cleaning function
def clean_text(text):
    if isinstance(text, str):  # Ensure the input is a string
        # 1. Expand contractions
        text = contractions.fix(text)
        
        # 2. Convert text to lowercase
        text = text.lower()
        
        # 3. Replace elongated words (e.g., "sooooo" -> "soo")
        text = ' '.join([reduce_elongation(word) for word in text.split()])
        
        # 4. Remove special characters (e.g., ãââ½)
        text = re.sub(r'[^\x00-\x7F]+', '', text)
        
        # 5. Remove HTTP URLs
        text = re.sub(r'http\S+', '', text)
        
        # 6. Remove special characters, punctuation, and numbers
        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
        text = re.sub(r'\d+', '', text)  # Remove numbers
        
        # 7. Remove multiple spaces
        text = re.sub(r'\s+', ' ', text).strip()

        # 8. Tokenize the text
        words = word_tokenize(text)
        
        # 9. Remove short words (length < 2)
        words = [word for word in words if len(word) >= 2]
        
        # 10. Remove stopwords
        words = [word for word in words if word not in stopwords]
        
        # 11. Lemmatize the words based on POS tagging
        words = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in words]
        
        # Join the words back into a single string
        return ' '.join(words)
    
    return text  # If not a string, return the original value

# Load your DataFrame (assuming df1 is already loaded)
# Apply the cleaning function to the 'text' column and create 'cleaned_text'
df['cleaned_text'] = df['Sentence'].apply(clean_text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [13]:
# Set pandas to display the full content of the columns
pd.set_option('display.max_colwidth', None)

# Display the first few rows to verify
print(df[['Sentence', 'cleaned_text', 'Label']].sample(20))

                                                                                                                          Sentence  \
23060                                                                                         Totally just skid all over the road.   
8303                                                                         transcribing my tenth (and last) report for the night   
2828                                                                      monday at the school  gosshhh what I`ve been waiting for   
11831   I had fun tonight! I`ll leave you with this...Brandi Carlile will be in Dallas today.  We should just sneak into the show!   
13406        Happy Mother`s Day to all the amazing women who put up with us crazy, demanding children. Thank you.  Very very much.   
19983                                                                                   awwww bless her  she needs another chance.   
17838                   Gonna nap n chill then probably go to 

In [14]:
df

Unnamed: 0,Sentence,Label,cleaned_text
0,Didn`t work for me Except when I used the word autofollow and got followed by an bot selling an autofollow program.,Negative,didnt work me except when use word autofollow get follow bot sell autofollow program
1,"in santa clara a long way from hoe, well not that far. it sure seems like it.",Neutral,santa clara long way from hoe well not far sure seem like
2,THANK YOU,Positive,thank you
3,mine pools... just in someone elses pocket,Neutral,mine pool just someone elses pocket
4,OMGoodness back to school soon FUN!,Positive,omgoodness back school soon fun
...,...,...,...
27757,"Itâs wonderful how often you share, but the posts are lacking in variety.",Negative,it wonderful often you share but post be lack variety
27758,"The visuals are striking, but the actual post is quite flat.",Negative,visuals be strike but actual post quite flat
27759,"Itâs great how polished your posts look, but theyâre not always captivating.",Negative,it great polished your post look but theyre not always captivate
27760,"The design is sleek, but the content is somewhat boring.",Negative,design sleek but content somewhat boring


# Example of How Glove Embedding works to convert text into mulidimensional array

In [15]:
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
import re

# Load GloVe embeddings
def load_glove_embeddings(file_path):
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

# Path to GloVe embeddings file (e.g., 'glove.6B.100d.txt')
glove_file_path = 'glove.6B.100d.txt'
glove_embeddings = load_glove_embeddings(glove_file_path)


In [16]:
import numpy as np
from nltk.tokenize import word_tokenize

# Modify the sentence_to_embedding function to handle NaN or non-string values
def sentence_to_embedding(sentence, embeddings):
    # Ensure the input is a valid string; if not, convert it to an empty string
    if not isinstance(sentence, str):
        sentence = ""
        
    words = word_tokenize(sentence)
    valid_words = [embeddings[word] for word in words if word in embeddings]
    
    if valid_words:
        return np.mean(valid_words, axis=0)
    else:
        return np.zeros(100)  # Assuming 100-dimensional GloVe embeddings

# Convert sentences to embeddings
embedding_matrix = np.array([sentence_to_embedding(text, glove_embeddings) for text in df['cleaned_text']])


In [17]:
words = word_tokenize("followfriday thank you much behind still about half what")
# Print the tokens
print(words)


['followfriday', 'thank', 'you', 'much', 'behind', 'still', 'about', 'half', 'what']


In [18]:
# Create lists to hold present and absent words
present_words = []
absent_words = []

# Check for each word if it is in glove_embeddings
for word in words:
    if word in glove_embeddings:
        present_words.append(word)
    else:
        absent_words.append(word)

# Extract embeddings for present words
valid_words = [glove_embeddings[word] for word in present_words]

# Print results
print("Words present in GloVe embeddings:", present_words)
print("Words absent from GloVe embeddings:", absent_words)
print("Embeddings for present words:")
for word, embedding in zip(present_words, valid_words):
    print(f"Word: {word}, Embedding: {embedding[:5]}...")

Words present in GloVe embeddings: ['thank', 'you', 'much', 'behind', 'still', 'about', 'half', 'what']
Words absent from GloVe embeddings: ['followfriday']
Embeddings for present words:
Word: thank, Embedding: [-0.056244  0.55972   0.4774   -0.22186   0.020482]...
Word: you, Embedding: [-0.49886  0.76602  0.89751 -0.78547 -0.6855 ]...
Word: much, Embedding: [-0.3384   0.6032   0.61412 -0.05686 -0.37309]...
Word: behind, Embedding: [-0.17607  0.32129  0.59174 -0.48619  0.34921]...
Word: still, Embedding: [-0.04248  0.80249  0.51451 -0.55427 -0.13799]...
Word: about, Embedding: [ 0.66039  0.63888  0.86264  0.27455 -0.89222]...
Word: half, Embedding: [ 0.11202  0.60857  0.35559 -0.47541 -0.1657 ]...
Word: what, Embedding: [-0.1518   0.38409  0.8934  -0.42421 -0.92161]...


In [19]:
# Example sentence
sentence = "followfriday thank you much behind still about half what"

# Convert sentence to embedding
embedding = sentence_to_embedding(sentence, glove_embeddings)

print(embedding)

[-6.14304952e-02  5.85532486e-01  6.50863767e-01 -3.41215014e-01
 -3.50802243e-01  3.79183263e-01 -1.88866466e-01  1.31394997e-01
 -3.80120017e-02 -2.49909148e-01  4.29248720e-01  2.14636013e-01
  1.24009117e-01 -1.37242079e-01  1.65671244e-01 -4.16303992e-01
 -1.09373137e-01  1.26152009e-01 -3.27242523e-01  4.08639610e-01
  3.11282247e-01  3.12231362e-01 -9.66031253e-02 -2.23210514e-01
  2.42671266e-01  8.28860328e-04 -2.34162509e-01 -4.27338243e-01
  1.14087753e-01 -4.32371974e-01 -4.54836190e-02  4.14996237e-01
  1.38813615e-01  2.77333464e-02 -6.83845058e-02  2.39871264e-01
 -4.22142446e-02  3.49828988e-01  1.08415015e-01 -1.85637623e-01
 -3.38328242e-01 -2.53160000e-01  3.06167513e-01 -2.31215000e-01
 -3.47438484e-01 -5.73085025e-02  2.38317490e-01 -4.17411238e-01
 -8.39576274e-02 -9.71107483e-01  2.09547937e-01 -5.71529679e-02
  4.11347561e-02  1.17401505e+00 -2.92947501e-01 -2.46105266e+00
 -3.95629965e-02 -1.78059638e-01  1.41188133e+00  4.13980007e-01
  7.49856308e-02  1.03161

In [20]:
import numpy as np

# Define the number of similar words to retrieve
N = 5

# Create a dictionary to store the similar words
similar_words = {}

# Convert the dictionary values to a numpy array for efficient computation
embeddings_array = np.array(list(glove_embeddings.values()))
word_list = list(glove_embeddings.keys())

# Iterate over the present words
for word in present_words:
    # Get the embedding of the current word
    word_embedding = glove_embeddings[word]
    
    # Calculate the cosine similarity with all words in the GloVe embeddings
    similarities = np.dot(embeddings_array, word_embedding) / (np.linalg.norm(embeddings_array, axis=1) * np.linalg.norm(word_embedding))
    
    # Get the index of the current word in the embeddings list
    word_index = word_list.index(word)
    
    # Set similarities of the word itself to -1 to exclude it
    similarities[word_index] = -1
    
    # Get the top N similar words
    top_similarities = np.argsort(-similarities)[:N]
    similar_words[word] = [word_list[i] for i in top_similarities]

# Print the similar words
for word, sims in similar_words.items():
    print(f"Similar words to '{word}': {sims}")


Similar words to 'thank': ['grateful', 'congratulations', 'thanking', 'me', 'wish']
Similar words to 'you': ["'ll", "n't", 'know', 'i', 'do']
Similar words to 'much': ['even', 'more', 'less', 'so', 'too']
Similar words to 'behind': ['away', 'while', 'back', 'out', 'ahead']
Similar words to 'still': ['now', 'already', 'though', 'even', 'but']
Similar words to 'about': ['some', 'than', 'much', 'just', 'there']
Similar words to 'half': ['second', 'third', 'over', 'five', 'almost']
Similar words to 'what': ['how', 'why', 'fact', 'know', 'that']


In [21]:
# Remove rows where 'cleaned_text' is NaN or empty
df = df.dropna(subset=['cleaned_text'])

# Alternatively, you can remove rows where 'cleaned_text' is an empty string
df = df[df['cleaned_text'].str.strip() != '']


# COMBINING VADER(lexicon and rule-based sentiment analysis tool) And Glove Embeddings

In [22]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd

# Initialize VADER
sid = SentimentIntensityAnalyzer()

# Function to get VADER sentiment scores
def vader_sentiment_features(text):
    scores = sid.polarity_scores(text)
    return [scores['neg'], scores['neu'], scores['pos'], scores['compound']]

# Apply to your dataset
vader_features = np.array([vader_sentiment_features(text) for text in df['cleaned_text']])


In [23]:
# Prepare the feature matrix and target vector
X = np.array([sentence_to_embedding(text, glove_embeddings) for text in df['cleaned_text']])
y = df['Label']

# Combine GloVe embeddings and VADER sentiment features
X_combined = np.hstack([X, vader_features])


In [24]:
y

0        Negative
1         Neutral
2        Positive
3         Neutral
4        Positive
           ...   
27757    Negative
27758    Negative
27759    Negative
27760    Negative
27761    Negative
Name: Label, Length: 27756, dtype: object

In [25]:
from sklearn.preprocessing import LabelEncoder

# Encode labels into numeric form
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Label'])  # e.g., 'negative' -> 0, 'neutral' -> 1, 'positive' -> 2


In [26]:
y

array([0, 1, 2, ..., 0, 0, 0])

**Using Different ML models to check which works best with our text(using GloVe embeddings and VADER sentiment features) to represent the input data in a form that is understandable by the model**

In [125]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
import numpy as np

# Get target names as a list of strings
target_names = label_encoder.classes_.astype(str)  # Convert to strings if needed

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Train a RandomForestClassifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# Perform cross-validation
cv_scores = cross_val_score(model, X_combined, y, cv=5)  # 5-fold cross-validation

print(f"Cross-Validation Scores: {cv_scores}")
print(f"Mean Cross-Validation Score: {np.mean(cv_scores)}")



              precision    recall  f1-score   support

    Negative       0.72      0.74      0.73      1903
     Neutral       0.67      0.63      0.65      1947
    Positive       0.73      0.76      0.74      1703

    accuracy                           0.71      5553
   macro avg       0.71      0.71      0.71      5553
weighted avg       0.71      0.71      0.71      5553

Cross-Validation Scores: [0.68233387 0.66804755 0.68335735 0.67993516 0.67164986]
Mean Cross-Validation Score: 0.6770647574253033


In [130]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
import numpy as np

# Get target names as a list of strings
target_names = label_encoder.classes_.astype(str)  # Convert to strings if needed

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Train a LogisticRegression model
model1 = LogisticRegression(max_iter=1000, random_state=42)
model1.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred, target_names=target_names))

# Perform cross-validation
cv_scores1 = cross_val_score(model1, X_combined, y, cv=5)  # 5-fold cross-validation

print(f"Cross-Validation Scores: {cv_scores1}")
print(f"Mean Cross-Validation Score: {np.mean(cv_scores1)}")


              precision    recall  f1-score   support

    Negative       0.73      0.72      0.72      1903
     Neutral       0.64      0.62      0.63      1947
    Positive       0.72      0.75      0.74      1703

    accuracy                           0.70      5553
   macro avg       0.70      0.70      0.70      5553
weighted avg       0.69      0.70      0.69      5553

Cross-Validation Scores: [0.67441023 0.64949568 0.67201009 0.6721902  0.66066282]
Mean Cross-Validation Score: 0.665753803666113


In [127]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Train a DecisionTreeClassifier
model2 = DecisionTreeClassifier(random_state=42)
model2.fit(X_train, y_train)

# Make predictions
y_pred2 = model2.predict(X_test)

# Evaluate the model
print("Decision Tree Classification Report:")
print(classification_report(y_test, y_pred2, target_names=label_encoder.classes_))

# Perform cross-validation
cv_scores2 = cross_val_score(model2, X_combined, y, cv=5)  # 5-fold cross-validation

print(f"Cross-Validation Scores: {cv_scores2}")
print(f"Mean Cross-Validation Score: {np.mean(cv_scores2)}")



Decision Tree Classification Report:
              precision    recall  f1-score   support

    Negative       0.62      0.63      0.62      1903
     Neutral       0.54      0.53      0.54      1947
    Positive       0.63      0.63      0.63      1703

    accuracy                           0.60      5553
   macro avg       0.60      0.60      0.60      5553
weighted avg       0.60      0.60      0.60      5553

Cross-Validation Scores: [0.56077796 0.55097262 0.56448127 0.56646254 0.52791787]
Mean Cross-Validation Score: 0.5541224503617486


In [128]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Train a Gaussian Naive Bayes
model3 = GaussianNB()
model3.fit(X_train, y_train)

# Make predictions
y_pred3 = model3.predict(X_test)

# Evaluate the model
print("Gaussian Naive Bayes Classification Report:")
print(classification_report(y_test, y_pred3, target_names=label_encoder.classes_))

# Perform cross-validation
cv_scores3 = cross_val_score(model3, X_combined, y, cv=5)  # 5-fold cross-validation

print(f"Cross-Validation Scores: {cv_scores3}")
print(f"Mean Cross-Validation Score: {np.mean(cv_scores3)}")



Gaussian Naive Bayes Classification Report:
              precision    recall  f1-score   support

    Negative       0.62      0.71      0.66      1903
     Neutral       0.59      0.44      0.51      1947
    Positive       0.65      0.73      0.68      1703

    accuracy                           0.62      5553
   macro avg       0.62      0.63      0.62      5553
weighted avg       0.62      0.62      0.61      5553

Cross-Validation Scores: [0.58688997 0.58123199 0.59528098 0.60122478 0.66462536]
Mean Cross-Validation Score: 0.6058506163555697


In [129]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Train an SVM model
model4 = SVC(random_state=42)
model4.fit(X_train, y_train)

# Make predictions
y_pred4 = model4.predict(X_test)

# Evaluate the model
print("Support Vector Machine (SVM) Classification Report:")
print(classification_report(y_test, y_pred4, target_names=label_encoder.classes_))

# Perform cross-validation
cv_scores4 = cross_val_score(model4, X_combined, y, cv=5)  # 5-fold cross-validation

print(f"Cross-Validation Scores: {cv_scores4}")
print(f"Mean Cross-Validation Score: {np.mean(cv_scores4)}")



Support Vector Machine (SVM) Classification Report:
              precision    recall  f1-score   support

    Negative       0.74      0.75      0.74      1903
     Neutral       0.70      0.62      0.66      1947
    Positive       0.72      0.80      0.76      1703

    accuracy                           0.72      5553
   macro avg       0.72      0.72      0.72      5553
weighted avg       0.72      0.72      0.72      5553

Cross-Validation Scores: [0.6904376  0.67489193 0.69650576 0.68587896 0.67182997]
Mean Cross-Validation Score: 0.6839088459077344


In [28]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Define the hyperparameter space
param_grid = {
    'C': [0.1, 1, 10, 100],       # Regularization parameter
    'gamma': [1, 0.1, 0.01, 0.001], # Kernel coefficient for 'rbf'
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'] # Kernel types
}

# Initialize the SVC model
svc = SVC(random_state=42)

# Set up GridSearchCV
grid_search = GridSearchCV(svc, param_grid, cv=5, scoring='accuracy', verbose=2)

# Fit the model to the data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions using the best model
y_pred_best = best_model.predict(X_test)

# Evaluate the model
print("Best Hyperparameters:", best_params)
print("Classification Report for the Best Model:")
print(classification_report(y_test, y_pred_best, target_names=label_encoder.classes_))

# Perform cross-validation on the best model
cv_scores_best = cross_val_score(best_model, X_combined, y, cv=5)

print(f"Cross-Validation Scores for the Best Model: {cv_scores_best}")
print(f"Mean Cross-Validation Score: {np.mean(cv_scores_best)}")


Fitting 5 folds for each of 64 candidates, totalling 320 fits
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=  39.6s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=  39.5s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=  39.8s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=  37.0s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=  37.6s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 1.1min
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 1.1min
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 1.1min
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 1.2min
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 1.1min
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time= 3.3min
[CV] END ........................C=0.1, gamma=1

NameError: name 'cross_val_score' is not defined

**SVM Worked best out of all, we could try/use Hugging Transformers and CNN,RNN but will increase the complexity of project**

In [27]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
import numpy as np

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Train an SVM model using the best hyperparameters
best_params = {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
model4 = SVC(C=best_params['C'], gamma=best_params['gamma'], kernel=best_params['kernel'], random_state=42)
model4.fit(X_train, y_train)

# Make predictions
y_pred4 = model4.predict(X_test)

# Evaluate the model
print("Support Vector Machine (SVM) Classification Report:")
print(classification_report(y_test, y_pred4, target_names=label_encoder.classes_))


Support Vector Machine (SVM) Classification Report:
              precision    recall  f1-score   support

    Negative       0.74      0.75      0.75      1971
     Neutral       0.68      0.63      0.66      1901
    Positive       0.72      0.77      0.75      1680

    accuracy                           0.72      5552
   macro avg       0.71      0.72      0.72      5552
weighted avg       0.71      0.72      0.71      5552



# PREDICTION ON A SAMPLE SENTENCE:-

In [28]:
def predict_sentiment(new_sentence):
    # Convert new sentence to GloVe embedding
    glove_feature = sentence_to_embedding(new_sentence, glove_embeddings).reshape(1, -1)
    
    # Extract VADER sentiment features
    vader_feature = np.array(vader_sentiment_features(new_sentence)).reshape(1, -1)
    
    # Combine GloVe and VADER features
    combined_feature = np.hstack([glove_feature, vader_feature])
    
    # Predict sentiment
    prediction = model4.predict(combined_feature)
    predicted_label = label_encoder.inverse_transform(prediction)
    return predicted_label[0]

# Example of prediction
new_sentence = "What an interesting way to post.I am impressed."
print(f"Predicted sentiment: {predict_sentiment(new_sentence)}")


Predicted sentiment: Positive


# PREDICTION ON A SAMPLE NEPALI SENTENCE:-

In [29]:
from googletrans import Translator
import numpy as np

# Initialize the translator
translator = Translator()

def translate_nepali_to_english(nepali_text):
    # Translate the Nepali text to English
    translation = translator.translate(nepali_text, src='ne', dest='en')
    return translation.text

def predict_sentiment(new_sentence):
    # Step 1: Translate Nepali to English
    translated_sentence = translate_nepali_to_english(new_sentence)
    
    # Step 2: Convert the translated sentence to GloVe embedding
    glove_feature = sentence_to_embedding(translated_sentence, glove_embeddings).reshape(1, -1)
    
    # Step 3: Extract VADER sentiment features from the translated sentence
    vader_feature = np.array(vader_sentiment_features(translated_sentence)).reshape(1, -1)
    
    # Step 4: Combine GloVe and VADER features
    combined_feature = np.hstack([glove_feature, vader_feature])
    
    # Step 5: Predict sentiment
    prediction = model4.predict(combined_feature)
    predicted_label = label_encoder.inverse_transform(prediction)
    
    return predicted_label[0]

# Example of prediction with Nepali text
new_sentence = "तपाईंको पोस्ट धेरै रोचक छ। म प्रभावित भएँ।"
print(f"Predicted sentiment: {predict_sentiment(new_sentence)}")


Predicted sentiment: Positive
