<a href="https://colab.research.google.com/github/Sandr001/Data-Mining-MGT7216/blob/development/MGT7216_Sentiment_Analytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Sentiment Analysis using Semi-Supervised Learning - 1**
**Self Learning Approach:**

**Import necessary packages**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import re

import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
df = pd.read_excel("/content/drive/MyDrive/Data Mining/A_II_Emotion_Data_Student_Copy_Final.xlsx")

**Create a Definition: Custom cleaning function**

In [3]:
def clean_text(text):
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'<[^>]+>', '', text)  # Remove HTML tags
    text = text.lower()  # Lowercase text
    text = re.sub(r'\b\w{1,2}\b', '', text)  # Remove words with 1 or 2 letters
    text = re.sub(r'[^a-z\s]', '', text)  # Keep text with letters and spaces

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return ' '.join(tokens)

**Clean the Reviews**

In [7]:
df['cleaned_reviews'] = df['text_reviews_'].apply(clean_text)
print(df["cleaned_reviews"])

0          fast shipping cloth finally fit well satisfied
1       wanted say delighted friendly helpful staff me...
2       order took day snow day weekend happy sweater ...
3       wouldnt give star ordered coat week ago receiv...
4       parcel never arrived chasing refund substantia...
                              ...                        
5717    given star could known evri hermes courier wou...
5718    wanted change delivery location item within ho...
5719    waiting since believe sent order wrong country...
5720    complain late delivery horrendous compnay revi...
5721    placed order th november alert order could tak...
Name: cleaned_reviews, Length: 5722, dtype: object


In [41]:
# !pip install langdetect
# !pip install googletrans==4.0.0-rc1



In [None]:
from googletrans import Translator
from langdetect import detect

# Assuming df is your DataFrame and 'Text Reviews' is the column containing text reviews
translator = Translator()

# Translate non-English texts to English
def translate_to_english(text):
    try:
        # Check if the language of the text is not English
        if detect(text) != 'en':
            translated_text = translator.translate(text, dest='en').text
            return translated_text
        else:
            return text  # Return original text if already in English
    except:
        return text  # Return original text if translation fails

# Apply translation to non-English texts
df['cleaned_reviews'] = df['cleaned_reviews'].apply(translate_to_english)

# Print the DataFrame to verify the translated texts
print(df)


**Perform Self-Learning**

Split data into labeled and unlabeled

In [8]:
unlabeled_data = df[df['emotions_'] == 'NaN'][['cleaned_reviews']]
unlabeled_data['emotions_'] = -1
print(unlabeled_data)

Empty DataFrame
Columns: [cleaned_reviews, emotions_]
Index: []


**Extract X and y from labeled_data**

In [9]:
# Define labeled data as data where "Sentiment" is not missing
labeled_data = df[df['emotions_'].notna() & (df['emotions_'] != 'NaN')]
# Extract labels from labeled_data
y_labeled = labeled_data['emotions_']
y_unlabeled = unlabeled_data['emotions_']
X_labeled = labeled_data['cleaned_reviews']
X_unlabeled = unlabeled_data['cleaned_reviews']

print("y_labeled ",y_labeled )
print("y_unlabeled",y_unlabeled)
print("X_unlabeled",X_unlabeled)
print("X_labeled",X_labeled)
print(df)

y_labeled  2            joy
3           fear
10      surprise
18         anger
25       neutral
          ...   
5692     sadness
5695        fear
5701       anger
5711         joy
5715       anger
Name: emotions_, Length: 627, dtype: object
y_unlabeled Series([], Name: emotions_, dtype: int64)
X_unlabeled Series([], Name: cleaned_reviews, dtype: object)
X_labeled 2       order took day snow day weekend happy sweater ...
3       wouldnt give star ordered coat week ago receiv...
10      went glasgow saturday bought kid stuff nd floo...
18      trying order product online cost option click ...
25                                               use evri
                              ...                        
5692     dont order online store tooooo much headache prb
5695    use evri formally hermes courier lost confiden...
5701    staff westfield stratford rude went voucher vo...
5711    packet came week super satisfiedeverything fit...
5715    received item wrong colour damaged contacted 

**Pipelines**

In [10]:
# Parameters
sdg_params = dict(alpha=1e-5, penalty="l2", loss="log_loss")
vectorizer_params = dict(ngram_range=(1, 2), min_df=1, max_df=0.8)

# Supervised Pipeline
pipeline = Pipeline(
    [
        ("vect", CountVectorizer(**vectorizer_params)),
        ("tfidf", TfidfTransformer()),
        ("clf", SGDClassifier(**sdg_params)),
    ]
)
# SelfTraining Pipeline
st_pipeline = Pipeline(
    [
        ("vect", CountVectorizer(**vectorizer_params)),
        ("tfidf", TfidfTransformer()),
        ("clf", SelfTrainingClassifier(SGDClassifier(**sdg_params), verbose=True)),
    ]
)

**Define a function for a classification report**

In [11]:
def eval_and_print_metrics(clf, X_train, y_train, X_test, y_test):
    print("Number of training samples:", len(X_train))
    print("Unlabeled samples in training set:", sum(1 for x in y_train if x == -1)) #if x == 'NaN'
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    #print("y Train", y_train)
    print("y Predict",y_pred)
    print("y Test",y_test)

    print(
        "Micro-averaged F1 score on test set: %0.3f"
        % f1_score(y_test, y_pred, average="micro")
    )
    print("\nConfusion Matrix:\n", confusion_matrix(y_test,y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred,zero_division=1))
    print("\n\n")


**Split the data**

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_labeled, y_labeled, test_size=0.2, stratify=y_labeled, random_state=42)

**Supervised SGDClassifier on the labeled data**

In [13]:
print("Supervised SGDClassifier on the labeled data:")
eval_and_print_metrics(pipeline, X_train, y_train, X_test, y_test)

Supervised SGDClassifier on the labeled data:
Number of training samples: 501
Unlabeled samples in training set: 0
y Predict ['surprise' 'joy' 'surprise' 'surprise' 'neutral' 'joy' 'joy' 'fear'
 'disgust' 'surprise' 'anger' 'surprise' 'joy' 'joy' 'joy' 'disgust'
 'anger' 'sadness' 'neutral' 'sadness' 'sadness' 'surprise' 'joy'
 'neutral' 'fear' 'disgust' 'fear' 'disgust' 'surprise' 'anger' 'disgust'
 'neutral' 'surprise' 'neutral' 'sadness' 'sadness' 'disgust' 'surprise'
 'disgust' 'surprise' 'sadness' 'sadness' 'neutral' 'fear' 'sadness'
 'surprise' 'disgust' 'neutral' 'sadness' 'surprise' 'sadness' 'joy'
 'anger' 'sadness' 'fear' 'neutral' 'fear' 'neutral' 'surprise' 'joy'
 'fear' 'neutral' 'neutral' 'joy' 'sadness' 'surprise' 'joy' 'neutral'
 'disgust' 'joy' 'surprise' 'sadness' 'fear' 'anger' 'neutral' 'fear'
 'surprise' 'disgust' 'surprise' 'sadness' 'surprise' 'fear' 'disgust'
 'fear' 'neutral' 'neutral' 'joy' 'neutral' 'joy' 'neutral' 'disgust'
 'surprise' 'joy' 'sadness' 'joy' 

ValueError: Length of values (126) does not match length of index (5722)

**Self Training Classifier on the labeled data**

In [17]:
print("Self Training Classifier on the labeled data:")
eval_and_print_metrics(st_pipeline, X_train, y_train, X_test, y_test)

Self Training Classifier on the labeled data:
Number of training samples: 501
Unlabeled samples in training set: 0
Micro-averaged F1 score on test set: 0.563

Confusion Matrix:
 [[ 3  3  0  1  2  0  2]
 [ 1  7  1  1  1  3  2]
 [ 0  3  9  0  0  2  2]
 [ 1  0  1 14  0  2  2]
 [ 1  1  1  0 13  1  3]
 [ 0  1  1  2  5  9  2]
 [ 0  3  1  1  1  1 16]]

Classification Report:
               precision    recall  f1-score   support

       anger       0.50      0.27      0.35        11
     disgust       0.39      0.44      0.41        16
        fear       0.64      0.56      0.60        16
         joy       0.74      0.70      0.72        20
     neutral       0.59      0.65      0.62        20
     sadness       0.50      0.45      0.47        20
    surprise       0.55      0.70      0.62        23

    accuracy                           0.56       126
   macro avg       0.56      0.54      0.54       126
weighted avg       0.57      0.56      0.56       126








**Self Training Classifier on the labeled and unlabeled data**

**Manage Labeled and Unlabeled Data**

In [18]:
test_indices = X_test.index
#print("TEST INDICES",test_indices)

# Exclude test data from X_labeled and y_labeled based on the identified indices
X_labeled_filtered = X_labeled.drop(index=test_indices, errors='ignore')
y_labeled_filtered = y_labeled.drop(index=test_indices, errors='ignore')

# Concatenate the filtered labeled data with the unlabeled data
X=X_combined = pd.concat([X_labeled_filtered, X_unlabeled])
y=y_combined = pd.concat([y_labeled_filtered, y_unlabeled])

**Mapping Labels**

In [19]:
# Define the mapping for labels
# label_mapping = {'Positive': 1, 'Negative': 0, -1:-1 }
label_mapping = {
    'anger': 1,
    'joy': 2,
    'sadness': 3,
    'fear': 4,
    'disgust': 5,
    'surprise': 6,
    'neutral': 7,
    -1:-1
}
# Apply the mapping to labels
y  = [label_mapping[label] for label in y]
#print(y)
y_test  = [label_mapping[label] for label in y_test]
#print(y_test)

In [34]:
print("Self Training Classifier on the labeled and unlabeled data:")
eval_and_print_metrics(st_pipeline, X, y, X_test, y_test)

Self Training Classifier on the labeled and unlabeled data:
Number of training samples: 501
Unlabeled samples in training set: 0
y Predict [6 2 6 6 7 2 2 4 5 6 1 6 2 2 2 5 1 3 7 3 3 6 2 7 4 5 4 5 6 5 5 7 6 7 3 3 3
 6 5 6 3 7 1 4 3 6 5 7 3 6 3 2 4 3 4 2 4 7 7 2 4 7 7 2 3 6 2 5 5 2 4 6 6 1
 7 4 6 5 6 3 6 4 5 4 7 7 2 7 2 2 3 6 2 3 2 6 2 3 2 4 3 7 3 2 7 5 5 3 4 6 7
 4 6 5 4 3 6 6 7 2 6 6 7 2 7 5]
y Test [6, 2, 6, 6, 7, 3, 6, 7, 5, 6, 1, 1, 2, 2, 2, 6, 5, 1, 7, 7, 6, 7, 2, 3, 2, 5, 4, 5, 6, 1, 5, 7, 3, 7, 4, 2, 5, 6, 4, 5, 4, 3, 7, 6, 3, 7, 7, 7, 5, 6, 3, 2, 3, 2, 4, 7, 5, 7, 6, 2, 4, 7, 3, 1, 3, 6, 5, 3, 6, 2, 4, 5, 4, 1, 1, 4, 7, 5, 3, 4, 6, 5, 3, 2, 7, 6, 2, 7, 2, 3, 5, 6, 2, 3, 2, 6, 2, 3, 2, 4, 3, 5, 3, 2, 1, 5, 1, 3, 4, 4, 7, 4, 2, 1, 4, 3, 6, 6, 7, 4, 6, 1, 7, 6, 3, 6]
Micro-averaged F1 score on test set: 0.532

Confusion Matrix:
 [[ 2  1  1  0  3  2  2]
 [ 0 15  2  2  0  1  0]
 [ 0  2  9  1  2  2  4]
 [ 0  1  3  9  1  2  0]
 [ 1  1  3  2  6  2  1]
 [ 0  2  1  1  3 14  2]
 [ 1  1  1 



ValueError: Length of values (126) does not match length of index (5722)