In [6]:
import re
import numpy as np
import pandas as pd
import gensim
import nltk
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from lazypredict.Supervised import LazyClassifier

In [7]:
data=pd.read_csv(r'D:\CODING\Python\NLP\END_SEM\hindi_data.csv')

In [8]:
data.head()

Unnamed: 0,text,sentiment
0,लोग वतन तक खा जाते हैं इसका इसे यकीन नहींमान ज...,negative
1,गुमनाम है वतन पर मिटने वाले लोग आतन्कवादियों स...,negative
2,ज़ंजीर बदली जा रही थी मैं समझा था रिहाई हो गयी है,negative
3,यूपी में बड़े स्तर पर दंगे करवा सकती है बीजेपी...,negative
4,अंग्रेजी नहीं आती है इसलिए हिन्दी ट्विट ज्यादा...,negative


In [9]:
data.dropna(inplace=True)

In [10]:
def preprocess_and_clean_hindi(text):
    # Define the function to preprocess Hindi text by tokenizing
    def preprocess_text_hindi(text):
        tokens = text.split(" ")
        return tokens

    # Define the function to remove non-Hindi characters while preserving spaces
    def remove_non_hindi(text):
        hindi_pattern = re.compile("[\u0900-\u097F\s]+")  # Unicode range for Hindi characters and space
        hindi_text = hindi_pattern.findall(text)
        cleaned_text = ''.join(hindi_text)
        return cleaned_text

    # Tokenize the text
    tokens = preprocess_text_hindi(text)

    # Remove non-Hindi characters while preserving spaces
    cleaned_text = ' '.join([remove_non_hindi(token) for token in tokens])

    return cleaned_text

data['text'] = data['text'].apply(preprocess_and_clean_hindi)

data['label'] = data['sentiment'].map({'positive': 1, 'negative': 0,'neutral': 2})


In [11]:
data = data[data['label'] != 2]

In [12]:
def generate_embedding_vector(word_vector, word2vec_model):
    word_embeddings = []
    for word in word_vector:
        if word in word2vec_model.wv:
            word_embedding = word2vec_model.wv[word]
            word_embeddings.append(word_embedding)

    embedding_vector = np.sum(word_embeddings, axis=0)
    return embedding_vector


In [13]:
tokenized_corpus=data['text'].to_list()

In [14]:
word2vec_model = Word2Vec(sentences=tokenized_corpus, vector_size=100, window=5, min_count=1, workers=4)

In [15]:
data['embedding_vector'] = data['text'].apply(lambda x: generate_embedding_vector(x, word2vec_model))

In [16]:
X = np.array(data['embedding_vector'].tolist())
y = np.array(data['label'].tolist())

del data
del tokenized_corpus

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [18]:
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

print(models)


 97%|█████████▋| 28/29 [00:08<00:00,  4.34it/s]

[LightGBM] [Info] Number of positive: 918, number of negative: 746
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001461 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25485
[LightGBM] [Info] Number of data points in the train set: 1664, number of used features: 100
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.551683 -> initscore=0.207472
[LightGBM] [Info] Start training from score 0.207472


100%|██████████| 29/29 [00:08<00:00,  3.33it/s]

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
LinearSVC                          0.63               0.63     0.63      0.63   
BaggingClassifier                  0.63               0.63     0.63      0.63   
LGBMClassifier                     0.62               0.62     0.62      0.62   
RidgeClassifier                    0.62               0.62     0.62      0.62   
LogisticRegression                 0.62               0.62     0.62      0.62   
LinearDiscriminantAnalysis         0.62               0.62     0.62      0.61   
RandomForestClassifier             0.62               0.62     0.62      0.61   
NuSVC                              0.61               0.61     0.61      0.61   
SVC                                0.61               0.61     0.61      0.60   
ExtraTreesClassifier               0.61               0.61     0.61      0.61   
RidgeClassifierCV           




In [25]:
# Get the best model from the list of models

best_model = models.iloc[0]
best_model

Accuracy            0.63
Balanced Accuracy   0.63
ROC AUC             0.63
F1 Score            0.63
Time Taken          0.31
Name: LinearSVC, dtype: float64