<a href="https://colab.research.google.com/github/Sandr001/Data-Mining-MGT7216/blob/development/MGT7216_Sentiment_Analytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Sentiment Analysis using Semi-Supervised Learning - 1**
**Self Learning Approach:**

**Import necessary packages**

In [39]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [40]:
import pandas as pd
import numpy as np
import re

import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [41]:
df = pd.read_excel("/content/drive/MyDrive/Data Mining/A_II_Emotion_Data_Student_Copy_Final.xlsx")

**Descriptive Statistics**

In [42]:
print("\033[1mThe dimension of the data:\033[0m", df.shape)

null_counts = df.isnull().sum()
print("\n\033[1mColumn wise Null value counts:\033[0m")
print(null_counts)

for column in ["brand_name_", "country_", "star_rating_"]:
    print("\n\033[1mUnique levels for column '{}':\033[0m".format(column), df[column].unique())

print(f"\n\033[1mCount for the Countries:\033[0m {len(df['country_'].unique())}")

print(f"\n\033[1m Labeled Data Count:\033[0m {df['emotions_'].notnull().sum()}")

print(f"\n\033[1m Unlabeled Data Count:\033[0m {df['emotions_'].isnull().sum()}")

[1mThe dimension of the data:[0m (5722, 6)

[1mColumn wise Null value counts:[0m
ID_                 0
brand_name_         0
country_            0
star_rating_        0
emotions_        5095
text_reviews_       0
dtype: int64

[1mUnique levels for column 'brand_name_':[0m ['Z_' 'H_']

[1mUnique levels for column 'country_':[0m ['US' 'GB' 'FRI' 'HR' 'NO' 'IE' 'CA' 'DK' 'PT' 'ES' 'AT' 'IN' 'IT' 'NL'
 'DE' 'AU' 'CZ' 'RO' 'FI' 'FR' 'GR' 'SI' 'HK' 'HU' 'MY' 'UA' 'MX' 'TR'
 'ZA' 'BE' 'RS' 'AE' 'SE' 'EN' 'CH' 'MAL' 'PL' 'LV' 'MK' 'IS' 'LB' 'PH'
 'JP' 'SG' 'LU' 'PE' 'SK' 'LT' 'BR' 'CO' 'TN' 'TH' 'CY' 'IL' 'HN' 'DO'
 'VN' 'CL' 'AR' 'UY' 'IR' 'PK' 'PA' 'NZ' 'GE' 'RE' 'MT' 'VE' 'CW' 'NI'
 'PR' 'AM' 'CN' 'KR' 'MQ' 'BG' 'MA' 'ID' 'EC' 'BD' 'EE' 'DZ' 'SX' 'KE'
 'AL' 'RU' 'SM' 'XK' 'JM' '.PL' 'MD' 'OM' 'BF']

[1mUnique levels for column 'star_rating_':[0m [5 1 2 3 4]

[1mCount for the Countries:[0m 93

[1m Labeled Data Count:[0m 627

[1m Unlabeled Data Count:[0m 5095


In [34]:
# !pip install langdetect

from langdetect import detect

df_reviews_filtered = df.dropna(subset=['text_reviews_']).loc[df['text_reviews_'] != '']

# Function to detect language
def detect_language(text):
    try:
        lang = detect(text)
        return lang
    except:
        return 'unknown'

# Apply language detection function to the text reviews column
df_reviews_filtered['language'] = df_reviews_filtered['text_reviews_'].apply(detect_language)

# Count the number of text reviews that are not in English or are empty
non_english_count = df_reviews_filtered[(df_reviews_filtered['language'] != 'en')].shape[0]

print("Count for text reviews that are not in English or are empty:", non_english_count)


Count for text reviews that are not in English or are empty: 2922


**Create a Definition: Custom cleaning function**

In [44]:
def clean_text(text):
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'<[^>]+>', '', text)  # Remove HTML tags
    text = text.lower()  # Lowercase text
    text = re.sub(r'\b\w{1,2}\b', '', text)  # Remove words with 1 or 2 letters
    text = re.sub(r'[^a-z\s]', '', text)  # Keep text with letters and spaces

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return ' '.join(tokens)

**Clean the Reviews**

In [45]:
df['cleaned_reviews'] = df['text_reviews_'].apply(clean_text)
print("Total cleaned reviews = ", len(df['cleaned_reviews']), "\n\n", df["cleaned_reviews"])

Total cleaned reviews =  5722 

 0          fast shipping cloth finally fit well satisfied
1       wanted say delighted friendly helpful staff me...
2       order took day snow day weekend happy sweater ...
3       wouldnt give star ordered coat week ago receiv...
4       parcel never arrived chasing refund substantia...
                              ...                        
5717    given star could known evri hermes courier wou...
5718    wanted change delivery location item within ho...
5719    waiting since believe sent order wrong country...
5720    complain late delivery horrendous compnay revi...
5721    placed order th november alert order could tak...
Name: cleaned_reviews, Length: 5722, dtype: object


In [46]:
!pip install langdetect
!pip install googletrans==4.0.0-rc1



In [47]:
from googletrans import Translator
from langdetect import detect

# Assuming df is your DataFrame and 'Text Reviews' is the column containing text reviews
translator = Translator()

# Translate non-English texts to English
def translate_to_english(text):
    try:
        # Check if the language of the text is not English
        if detect(text) != 'en':
            translated_text = translator.translate(text, dest='en').text
            return translated_text
        else:
            return text  # Return original text if already in English
    except:
        return text  # Return original text if translation fails

# Apply translation to non-English texts
df['cleaned_reviews'] = df['cleaned_reviews'].apply(translate_to_english)

# Print the DataFrame to verify the translated texts
print(df)


         ID_ brand_name_ country_  star_rating_ emotions_  \
0        ID1          Z_       US             5       NaN   
1       ID10          Z_       GB             5       NaN   
2      ID100          H_      FRI             5       joy   
3     ID1000          H_       GB             1      fear   
4     ID1001          H_       GB             1       NaN   
...      ...         ...      ...           ...       ...   
5717   ID995          H_       GB             1       NaN   
5718   ID996          H_       GB             1       NaN   
5719   ID997          H_       CY             1       NaN   
5720   ID998          H_       GB             1       NaN   
5721   ID999          H_       NL             1       NaN   

                                          text_reviews_  \
0     Fast shipping! All cloths was finally fits wel...   
1     Just wanted to say how delighted I was with th...   
2     My order took 6 days with a snow day and a wee...   
3     Wouldnt give them no star

**Perform Self-Learning**

Split data into labeled and unlabeled

In [48]:
# unlabeled_data = df[df['emotions_'] == 'NaN'][['cleaned_reviews']]

unlabeled_data = df[pd.isnull(df['emotions_'])][['cleaned_reviews']]

print(len(unlabeled_data))

unlabeled_data['emotions_'] = -1

unlabeled_data

5095


Unnamed: 0,cleaned_reviews,emotions_
0,fast shipping cloth finally fit well satisfied,-1
1,wanted say delighted friendly helpful staff me...,-1
4,parcel never arrived chasing refund substantia...,-1
5,avoid ordering online ordered day ago evri pac...,-1
6,awful service returning item booked collection...,-1
...,...,...
5717,given star could known evri hermes courier wou...,-1
5718,wanted change delivery location item within ho...,-1
5719,waiting since believe sent order wrong country...,-1
5720,complain late delivery horrendous compnay revi...,-1


**Extract X and y from labeled_data**

In [118]:
# Define labeled data as data where "Sentiment" is not missing
labeled_data = df[df['emotions_'].notnull() & (df['emotions_'] != 'NaN')]

print(len(labeled_data))

# Extract labels from labeled_data
y_labeled = labeled_data['emotions_']
y_unlabeled = unlabeled_data['emotions_']
X_labeled = labeled_data['cleaned_reviews']
X_unlabeled = unlabeled_data['cleaned_reviews']

print("y_labeled ",y_labeled )
print("y_unlabeled",y_unlabeled)
print("X_unlabeled",X_unlabeled)
print("X_labeled",X_labeled)
print(df)

627
y_labeled  2            joy
3           fear
10      surprise
18         anger
25       neutral
          ...   
5692     sadness
5695        fear
5701       anger
5711         joy
5715       anger
Name: emotions_, Length: 627, dtype: object
y_unlabeled 0      -1
1      -1
4      -1
5      -1
6      -1
       ..
5717   -1
5718   -1
5719   -1
5720   -1
5721   -1
Name: emotions_, Length: 5095, dtype: int64
X_unlabeled 0          fast shipping cloth finally fit well satisfied
1       wanted say delighted friendly helpful staff me...
4       parcel never arrived chasing refund substantia...
5       avoid ordering online ordered day ago evri pac...
6       awful service returning item booked collection...
                              ...                        
5717    given star could known evri hermes courier wou...
5718    wanted change delivery location item within ho...
5719    waiting since believe sent order wrong country...
5720    complain late delivery horrendous compnay revi

**Pipelines**

In [119]:
from sklearn.svm import SVC  # Import Support Vector Classification

# Parameters
svm_params = dict(kernel='linear', probability=True)  # Adjust parameters as needed
vectorizer_params = dict(ngram_range=(1, 2), min_df=1, max_df=0.8)

# Supervised Pipeline with SVM
pipeline = Pipeline(
    [
        ("vect", CountVectorizer(**vectorizer_params)),
        ("tfidf", TfidfTransformer()),
        ("clf", SVC(**svm_params)),
    ]
)
# SelfTraining Pipeline
st_pipeline = Pipeline(
    [
        ("vect", CountVectorizer(**vectorizer_params)),
        ("tfidf", TfidfTransformer()),
        ("clf", SelfTrainingClassifier(SVC(**svm_params), verbose=True)),
    ]
)

In [None]:
# Parameters
sdg_params = dict(alpha=1e-5, penalty="l2", loss="log_loss")
vectorizer_params = dict(ngram_range=(1, 2), min_df=1, max_df=0.8)

# Supervised Pipeline
pipeline = Pipeline(
    [
        ("vect", CountVectorizer(**vectorizer_params)),
        ("tfidf", TfidfTransformer()),
        ("clf", SGDClassifier(**sdg_params)),
    ]
)
# SelfTraining Pipeline
st_pipeline = Pipeline(
    [
        ("vect", CountVectorizer(**vectorizer_params)),
        ("tfidf", TfidfTransformer()),
        ("clf", SelfTrainingClassifier(SGDClassifier(**sdg_params), verbose=True)),
    ]
)

**Define a function for a classification report**

In [127]:
def eval_and_print_metrics(clf, X_train, y_train, X_test, y_test):
    import numpy as np
    np.random.seed(42)  # Set random seed
    print("Number of training samples:", len(X_train))
    print("Unlabeled samples in training set:", sum(1 for x in y_train if x == -1)) #if x == NULL
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    #print("y Train", y_train)
    print("y Predict",y_pred)
    print("y Test",y_test)

    print(
        "Micro-averaged F1 score on test set: %0.3f"
        % f1_score(y_test, y_pred, average="micro")
    )
    print("\nConfusion Matrix:\n", confusion_matrix(y_test,y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred,zero_division=1))
    print("\n\n")

    return y_pred

**Split the data**

In [128]:
X_train, X_test, y_train, y_test = train_test_split(X_labeled, y_labeled, test_size=0.2, stratify=y_labeled, random_state=42)

print(len(X_train), len(y_train), len(X_test), len(y_test))

501 501 126 126


**Supervised SGDClassifier on the labeled data**

In [129]:
print("Supervised SGDClassifier on the labeled data:")
y_pred = eval_and_print_metrics(pipeline, X_train, y_train, X_test, y_test)
len(y_pred)

Supervised SGDClassifier on the labeled data:
Number of training samples: 501
Unlabeled samples in training set: 0
y Predict ['surprise' 'joy' 'surprise' 'surprise' 'neutral' 'joy' 'surprise'
 'surprise' 'disgust' 'surprise' 'sadness' 'surprise' 'joy' 'joy' 'joy'
 'disgust' 'anger' 'sadness' 'neutral' 'sadness' 'sadness' 'surprise'
 'joy' 'neutral' 'sadness' 'disgust' 'sadness' 'disgust' 'surprise'
 'anger' 'disgust' 'neutral' 'disgust' 'neutral' 'sadness' 'sadness'
 'sadness' 'surprise' 'disgust' 'surprise' 'sadness' 'sadness' 'disgust'
 'fear' 'sadness' 'surprise' 'disgust' 'neutral' 'sadness' 'surprise'
 'sadness' 'surprise' 'fear' 'sadness' 'fear' 'neutral' 'disgust'
 'neutral' 'surprise' 'joy' 'fear' 'surprise' 'neutral' 'joy' 'sadness'
 'surprise' 'joy' 'neutral' 'disgust' 'surprise' 'surprise' 'surprise'
 'surprise' 'anger' 'neutral' 'fear' 'surprise' 'disgust' 'surprise'
 'sadness' 'surprise' 'fear' 'disgust' 'fear' 'neutral' 'sadness' 'joy'
 'neutral' 'joy' 'joy' 'disgust' 'su

126

**Self Training Classifier on the labeled data**

In [130]:
print("Self Training Classifier on the labeled data:")
eval_and_print_metrics(st_pipeline, X_train, y_train, X_test, y_test)

Self Training Classifier on the labeled data:
Number of training samples: 501
Unlabeled samples in training set: 0




y Predict ['surprise' 'joy' 'surprise' 'surprise' 'neutral' 'joy' 'surprise'
 'surprise' 'disgust' 'surprise' 'sadness' 'surprise' 'joy' 'joy' 'joy'
 'disgust' 'anger' 'sadness' 'neutral' 'sadness' 'sadness' 'surprise'
 'joy' 'neutral' 'sadness' 'disgust' 'sadness' 'disgust' 'surprise'
 'anger' 'disgust' 'neutral' 'disgust' 'neutral' 'sadness' 'sadness'
 'sadness' 'surprise' 'disgust' 'surprise' 'sadness' 'sadness' 'disgust'
 'fear' 'sadness' 'surprise' 'disgust' 'neutral' 'sadness' 'surprise'
 'sadness' 'surprise' 'fear' 'sadness' 'fear' 'neutral' 'disgust'
 'neutral' 'surprise' 'joy' 'fear' 'surprise' 'neutral' 'joy' 'sadness'
 'surprise' 'joy' 'neutral' 'disgust' 'surprise' 'surprise' 'surprise'
 'surprise' 'anger' 'neutral' 'fear' 'surprise' 'disgust' 'surprise'
 'sadness' 'surprise' 'fear' 'disgust' 'fear' 'neutral' 'sadness' 'joy'
 'neutral' 'joy' 'joy' 'disgust' 'surprise' 'joy' 'sadness' 'joy'
 'sadness' 'surprise' 'sadness' 'joy' 'fear' 'sadness' 'neutral'
 'surprise' 'joy' 'n

array(['surprise', 'joy', 'surprise', 'surprise', 'neutral', 'joy',
       'surprise', 'surprise', 'disgust', 'surprise', 'sadness',
       'surprise', 'joy', 'joy', 'joy', 'disgust', 'anger', 'sadness',
       'neutral', 'sadness', 'sadness', 'surprise', 'joy', 'neutral',
       'sadness', 'disgust', 'sadness', 'disgust', 'surprise', 'anger',
       'disgust', 'neutral', 'disgust', 'neutral', 'sadness', 'sadness',
       'sadness', 'surprise', 'disgust', 'surprise', 'sadness', 'sadness',
       'disgust', 'fear', 'sadness', 'surprise', 'disgust', 'neutral',
       'sadness', 'surprise', 'sadness', 'surprise', 'fear', 'sadness',
       'fear', 'neutral', 'disgust', 'neutral', 'surprise', 'joy', 'fear',
       'surprise', 'neutral', 'joy', 'sadness', 'surprise', 'joy',
       'neutral', 'disgust', 'surprise', 'surprise', 'surprise',
       'surprise', 'anger', 'neutral', 'fear', 'surprise', 'disgust',
       'surprise', 'sadness', 'surprise', 'fear', 'disgust', 'fear',
       'neutral',

**Self Training Classifier on the labeled and unlabeled data**

**Manage Labeled and Unlabeled Data**

In [131]:
test_indices = X_test.index
#print("TEST INDICES",test_indices)

# Exclude test data from X_labeled and y_labeled based on the identified indices
X_labeled_filtered = X_labeled.drop(index=test_indices, errors='ignore')
y_labeled_filtered = y_labeled.drop(index=test_indices, errors='ignore')

# Concatenate the filtered labeled data with the unlabeled data
X=X_combined = pd.concat([X_labeled_filtered, X_unlabeled])
y=y_combined = pd.concat([y_labeled_filtered, y_unlabeled])


**Mapping Labels**

In [132]:
# Define the mapping for labels
# label_mapping = {'Positive': 1, 'Negative': 0, -1:-1 }
label_mapping = {
    'anger': 1,
    'joy': 2,
    'sadness': 3,
    'fear': 4,
    'disgust': 5,
    'surprise': 6,
    'neutral': 7,
    -1:-1
}
# Apply the mapping to labels
y  = [label_mapping[label] for label in y]
#print(y)
y_test  = [label_mapping[label] for label in y_test]
#print(y_test)

In [133]:
print("Self Training Classifier on the labeled and unlabeled data:")
y_pred = eval_and_print_metrics(st_pipeline, X, y, X_test, y_test)

Self Training Classifier on the labeled and unlabeled data:
Number of training samples: 5596
Unlabeled samples in training set: 5095
End of iteration 1, added 168 new labels.
End of iteration 2, added 404 new labels.
End of iteration 3, added 1090 new labels.
End of iteration 4, added 973 new labels.
End of iteration 5, added 406 new labels.
End of iteration 6, added 183 new labels.
End of iteration 7, added 126 new labels.
End of iteration 8, added 69 new labels.
End of iteration 9, added 57 new labels.
End of iteration 10, added 51 new labels.
y Predict [4 2 7 7 7 2 2 7 5 6 7 7 2 2 2 4 1 7 7 4 4 7 2 7 4 4 4 5 7 1 1 7 1 7 7 2 7
 6 4 7 4 4 7 4 4 7 7 7 5 7 4 7 4 2 4 7 5 7 4 2 4 4 7 7 4 6 2 7 5 4 4 4 7 1
 7 7 7 7 7 4 4 7 5 7 7 7 2 7 2 7 5 4 2 3 2 7 2 7 2 4 3 7 4 2 7 5 5 4 5 7 7
 4 7 1 4 7 7 2 7 7 6 7 6 2 2 5]
y Test [6, 2, 6, 6, 7, 3, 6, 7, 5, 6, 1, 1, 2, 2, 2, 6, 5, 1, 7, 7, 6, 7, 2, 3, 2, 5, 4, 5, 6, 1, 5, 7, 3, 7, 4, 2, 5, 6, 4, 5, 4, 3, 7, 6, 3, 7, 7, 7, 5, 6, 3, 2, 3, 2, 4, 7, 5, 7,