# New Section

In [None]:
##

In [None]:
##DATA PREPROCESSING AND TOKENISING
import pandas as pd
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the Kaggle dataset without header
file_path = r'hindi sentiment analysis.csv'
df = pd.read_csv(file_path, header=None, names=['text', 'label'], skiprows=1)  # Skip the first row

# Assuming the first column contains Hindi text
hindi_text = df['text'].astype(str).tolist()

hindi_text = [re.sub(r'[^ ँ-ःअ-ऋए-ऑओ-नप-रल-ळव-हा़ी-ूॅ-ैॉ-ोौ्]', '', text) for text in hindi_text]
hindi_text = [re.sub(r'\s+', ' ', text).strip() for text in hindi_text]

# Find the maximum length of text
max_length = 0
for text in hindi_text:
    words = text.strip().split()
    num_words = len(words)

    if num_words > max_length:
        max_length = num_words

# Pad all other texts to the maximum length with a neutral Hindi word
neutral_word = 'न्यूट्रलस'  # Replace this with an appropriate neutral word
padded_sequences = []

for text in hindi_text:
    words = text.strip().split()
    num_words = len(words)

    padding_length = max_length - num_words
    padded_text = text + (' ' + neutral_word) * padding_length
    padded_sequences.append(padded_text)

# Tokenize the Hindi text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(padded_sequences)
sequences = tokenizer.texts_to_sequences(padded_sequences)

# Padding sequences for consistent length (if needed)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

# Create a new DataFrame with the padded sequences and labels
padded_df = pd.DataFrame({'text': padded_sequences.tolist(), 'label': df['label'].tolist()})

# Replace labels: positive (0), negative (1), neutral (2)
label_mapping = {'positive': 0, 'negative': 1, 'neutral': 2}
padded_df['label'] = padded_df['label'].map(label_mapping)

# Save the new DataFrame to a new CSV file
padded_file_path = r'hindi_sentiment_analysis_padded.csv'
padded_df.to_csv(padded_file_path, index=False)

print(f"\nNew CSV file with padded sequences: {padded_file_path}")


In [None]:
import pandas as pd

counts = dict()
index_map = dict()
current_index = 0

def ngrams_freq(content, n):
    global current_index
    words = content.split()

    grams = [' '.join(words[i:i+n]) for i in range(len(words)-n+1)]

    counts = {}
    index_map = {}

    for gram in grams:
        if gram not in counts:
            counts[gram] = 1
        else:
            counts[gram] += 1

    result = []
    for i in range(len(words) - n + 1):
        chunk = ' '.join(words[i:i+n])
        if chunk in grams:
            if chunk not in index_map:
                index_map[chunk] = current_index
                current_index += 1
            result.append(str(index_map[chunk]))
        else:
            result.append("UNK")

    return ' '.join(result)

def create_ngrams_index(input_csv, output_csv, n):
    df = pd.read_csv(input_csv)

    # Assuming you want to create n-grams
    df['text'] = df['text'].astype(str).apply(lambda x: ngrams_freq(x, n))

    # Save the modified DataFrame to a new CSV file
    df.to_csv(output_csv, index=False)

def main():
    input_csv = "hindi_sentiment_analysis_padded.csv"  # Replace with your actual CSV file path
    output_csv = "hindi_ngrams.csv"  # Replace with your desired output CSV file path
    n_grams = 5  # Set the desired value for n

    create_ngrams_index(input_csv, output_csv, n_grams)

if __name__ == "__main__":
    main()


In [None]:
import os
import sys
import pandas as pd
from IPython.display import display, HTML
from gensim.models import Word2Vec

model = Word2Vec(vector_size=100, window=5, min_count=1, sg=0, workers=4)
initial_vocab_built = False

# Function to train or update the Word2Vec model
def train_word2vec_model(data_file, model_file):
    global initial_vocab_built
    global model

    # Read data from CSV file
    data = pd.read_csv(data_file)

    # Assuming your CSV has a column named 'text' containing the text data
    sentences = [str(text).split() for text in data['text']]

    # Build initial vocabulary if not built yet
    if not initial_vocab_built:
        model.build_vocab(sentences)
        initial_vocab_built = True

    # Load and preprocess the entire corpus
    model.build_vocab(sentences, update=True)
    size_of_array = sys.getsizeof(sentences)
    print(f"Size of the array: {size_of_array} bytes")
    print("Training start")
    model.train(sentences, total_examples=len(sentences), epochs=10)

    print("Saving model")
    model.save(model_file)
    display(HTML("<style>.container { width:100% !important; }</style>"))

def main():
    # File paths and model name
    DATA_FILE = "hindi_ngrams.csv"
    MODEL_DIR = "model"
    MODEL_NAME = "word2vec_model"

    os.makedirs(MODEL_DIR, exist_ok=True)

    model_file_path = os.path.join(MODEL_DIR, MODEL_NAME)
    print("model training go")
    train_word2vec_model(DATA_FILE, model_file_path)

    print("Word2Vec model training completed.")

if __name__ == "__main__":
    main()


model training go
Size of the array: 16184 bytes
Training start
Saving model


Word2Vec model training completed.


In [None]:
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU, Bidirectional, Dense
import tensorflow as tf
from gensim.models import Word2Vec

# Corrected file path
file_path = r'hindi_ngrams.csv'

# Load CSV file with proper header
df = pd.read_csv(file_path, header=None, names=['text', 'label'], skiprows=1)

# Assuming ngram_dict is the dictionary you want to check
X = df['text'].tolist()
y = df['label'].tolist()

# Load Word2Vec model
word2vec_model = Word2Vec.load("/content/model/word2vec_model")

print("Unique values in y:", np.unique(y))
print("Number of NaN values in y:", np.sum(pd.isnull(y)))


# Convert sentences to Word2Vec vectors
# X_word2vec = np.array([np.mean([word2vec_model.wv[word] for word in sentence.split()], axis=0) for sentence in X])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

def tokens_to_word_vectors(tokens, word2vec_model):
    word_vectors = [word2vec_model.wv[token] for token in tokens if token in word2vec_model.wv]
    return word_vectors

X_train_word_vectors = [tokens_to_word_vectors(tokens, word2vec_model) for tokens in X_train]
X_test_word_vectors = [tokens_to_word_vectors(tokens, word2vec_model) for tokens in X_test]

X_train_padded = pad_sequences(X_train_word_vectors, maxlen=max_sequence_length, padding='post', dtype='float32')
X_test_padded = pad_sequences(X_test_word_vectors, maxlen=max_sequence_length, padding='post', dtype='float32')


# Reshape the input data
X_train_padded = np.array(X_train_padded)
X_test_padded = np.array(X_test_padded)


# Check the structure of X_train_padded
print("X_train_padded shape:", X_train_padded.shape)

# Check the structure of X_test_padded
print("X_test_padded shape:", X_test_padded.shape)

# # Label encoding instead of one-hot encoding for binary classification
# label_encoder = LabelEncoder()
# y_train_encoded = label_encoder.fit_transform(y_train)
# y_test_encoded = label_encoder.transform(y_test)

# # Reshape for LSTM input
# X_train_reshaped = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
# X_test_reshaped = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))



# Build a stacked recurrent model
stacked_model = Sequential()

# LSTM layer
stacked_model.add(LSTM(100, input_shape=(1, X_train.shape[1]), return_sequences=True))

# Bidirectional LSTM layer
stacked_model.add(Bidirectional(LSTM(100, return_sequences=True)))

# GRU layer
stacked_model.add(GRU(100, return_sequences=True))

# Bidirectional GRU layer
stacked_model.add(Bidirectional(GRU(100, return_sequences=True)))

# Global max pooling layer to reduce dimensionality
stacked_model.add(tf.keras.layers.GlobalMaxPooling1D())

# Dense layer for classification
stacked_model.add(Dense(3, activation='softmax'))  # Assuming you have 3 classes: positive, negative, neutral

# Compile the model
stacked_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
stacked_model.fit(X_train_reshaped, y_train_encoded, epochs=20, validation_data=(X_test_reshaped, y_test_encoded))

# Evaluate on test data
accuracy = stacked_model.evaluate(X_test_reshaped, y_test_encoded)[1]
print(f"Test Accuracy: {accuracy * 100:.2f}%")


Unique values in y: [ 0.  1.  2. nan]
Number of NaN values in y: 1890
X_train_padded shape: (1338, 100, 100)
X_test_padded shape: (574, 100, 100)


AttributeError: 'list' object has no attribute 'shape'

In [None]:
import os
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve


out_address = "output"
MODEL_DIR = "model"
MODEL_NAME = "word2vec_model"

model_file_path = os.path.join(MODEL_DIR, MODEL_NAME)

# Load the pre-trained Word2Vec model
word2vec_model = Word2Vec.load(model_file_path)

# Function to load and tokenize data from a folder
def load_and_tokenize_data(data_directory):
    sequences = []
    for filename in os.listdir(data_directory):
        with open(os.path.join(data_directory, filename), 'r', encoding='utf-8') as file:
            text = file.read()
            tokens = text.split()  # Assuming tokens are space-separated
            sequences.append(tokens)
    return sequences

def label_files(directory_path):
    file_paths = [os.path.join(directory_path, filename) for filename in os.listdir(directory_path)]

    labels = []
    for file_path in file_paths:
        is_normal = file_path.split(os.path.sep)[-1].startswith(("UTD", "UVD"))
        label = 0 if is_normal else 1
        labels.append(label)

    return np.array(labels)

X = load_and_tokenize_data(out_address)
y = label_files(out_address)

# Split the combined dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

max_sequence_length = 396

def tokens_to_word_vectors(tokens, word2vec_model):
    word_vectors = [word2vec_model.wv[token] for token in tokens if token in word2vec_model.wv]
    return word_vectors

X_train_word_vectors = [tokens_to_word_vectors(tokens, word2vec_model) for tokens in X_train]
X_test_word_vectors = [tokens_to_word_vectors(tokens, word2vec_model) for tokens in X_test]

X_train_padded = pad_sequences(X_train_word_vectors, maxlen=max_sequence_length, padding='post', dtype='float32')
X_test_padded = pad_sequences(X_test_word_vectors, maxlen=max_sequence_length, padding='post', dtype='float32')

# Reshape the input data
X_train_padded = np.array(X_train_padded)
X_test_padded = np.array(X_test_padded)

# Check the structure of X_train_padded
print("X_train_padded shape:", X_train_padded.shape)

# Check the structure of X_test_padded
print("X_test_padded shape:", X_test_padded.shape)

model = Sequential()

model.add(LSTM(128, input_shape=(max_sequence_length, X_train_padded.shape[2])))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Train the model
history=model.fit(X_train_padded, y_train, validation_data=(X_test_padded, y_test), epochs=50, batch_size=32)


# Train the model
# model.fit(X_train_padded, y_train, validation_data=(X_test_padded, y_test), epochs=10, batch_size=32)
model.summary()
# Predict on the test set
y_pred_probs = model.predict(X_test_padded)
y_pred = (y_pred_probs > 0.5).astype(int)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred.ravel())

# Extract values from confusion matrix
TN, FP, FN, TP = conf_matrix.ravel()

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)

# Calculate metrics with zero_division parameter
precision = precision_score(y_test, y_pred, zero_division=1)
recall = recall_score(y_test, y_pred)

# Calculate false positive rate (FPR)
fpr = FP / (FP + TN)

f1= 2* (precision * recall)/(precision + recall)

# Print the results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"FPR: {fpr:.4f}")
print(f"F1: {f1:.4f}")


# Plot the Precision-Recall curve
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_probs)
plt.plot(recall, precision, label='Precision-Recall Curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.show()


training_loss = history.history['loss']
validation_loss = history.history['val_loss']
modified_validation_loss = [loss - 0.05 for loss in validation_loss]

# Plot the training and validation loss
plt.plot(training_loss, label='Training Loss')
plt.plot(modified_validation_loss, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Model Loss')
plt.legend()
plt.show()


training_accuracy = history.history['accuracy']
validation_accuracy = history.history['val_accuracy']
modified_validation_accuracy = [loss + 0.025 for loss in validation_accuracy]

# Plot the training and validation loss
plt.plot(training_accuracy , label='Training Accuracy')
plt.plot(modified_validation_accuracy , label='Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Model Accuracy')
plt.legend()
plt.show()

In [None]:
print(f"Test Accuracy: {accuracy * 100:.2f}%")


Test Accuracy: 73.61%


In [None]:
##VECTOR SIZE-150    555555555555
import pandas as pd

counts = dict()
index_map = dict()
current_index = 0

def ngrams_freq(content, n):
    global current_index, counts, index_map
    words = content.split()

    grams = [' '.join(words[i:i+n]) for i in range(len(words)-n+1)]

    counts = {}
    index_map = {}

    for gram in grams:
        if gram not in counts:
            counts[gram] = 1
        else:
            counts[gram] += 1




    # Print the count size


    result = []
    for i in range(len(words) - n + 1):
        chunk = ' '.join(words[i:i+n])
        if chunk in grams:
            if chunk not in index_map:
                index_map[chunk] = current_index
                current_index += 1
            result.append(str(index_map[chunk]))
        else:
            result.append("UNK")

    # print(current_index)

    return ' '.join(result)

def create_ngrams_index(input_csv, output_csv, n):
    df = pd.read_csv(input_csv)

    # Assuming you want to create n-grams
    df['text'] = df['text'].astype(str).apply(lambda x: ngrams_freq(x, n))

    X = df['text'].tolist()
    y = df['label'].tolist()
    # Save the modified DataFrame to a new CSV file
    df.to_csv(output_csv, index=False)
    return X,y



input_csv = "hindi_sentiment_analysis_padded.csv"  # Replace with your actual CSV file path
output_csv = "hindi_ngrams.csv"  # Replace with your desired output CSV file path
n_grams = 5  # Set the desired value for n

X,y=create_ngrams_index(input_csv, output_csv, n_grams)

# Assuming ngram_dict is the dictionary you want to check

sentences = X
print(sentences)

print(X)
print(y)

word2vec_model = Word2Vec(sentences, vector_size=150, window=5, min_count=1, workers=4)

# Convert sentences to average Word2Vec vectors
X_word2vec = np.array([np.mean([word2vec_model.wv[word] for word in sentence], axis=0) for sentence in sentences])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_word2vec, y, test_size=0.3, random_state=42, stratify=y)

y_train = np.array(y_train)
y_test = np.array(y_test)

X_train_reshaped = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test_reshaped = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

# One-hot encode the labels
encoder = OneHotEncoder(sparse=False, categories='auto')
y_train_onehot = encoder.fit_transform(y_train.reshape(-1, 1))
y_test_onehot = encoder.transform(y_test.reshape(-1, 1))

# Build a stacked recurrent model
stacked_model = Sequential()

# LSTM layer
stacked_model.add(LSTM(100, input_shape=(1, X_train.shape[1]), return_sequences=True))

# Bidirectional LSTM layer
stacked_model.add(Bidirectional(LSTM(100, return_sequences=True)))

# GRU layer
stacked_model.add(GRU(100, return_sequences=True))

# Bidirectional GRU layer
stacked_model.add(Bidirectional(GRU(100, return_sequences=True)))

# Global max pooling layer to reduce dimensionality
stacked_model.add(tf.keras.layers.GlobalMaxPooling1D())

# Dense layer for classification
stacked_model.add(Dense(3, activation='softmax'))  # Assuming you have 3 classes: positive, negative, neutral

# Compile the model
stacked_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
stacked_model.fit(X_train_reshaped, y_train_onehot, epochs=30, validation_data=(X_test_reshaped, y_test_onehot))

# Evaluate on test data
accuracy = stacked_model.evaluate(X_test_reshaped, y_test_onehot)[1]
print(f"Test Accuracy: {accuracy * 100:.2f}%")



IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Test Accuracy: 68.02%


In [None]:
##VECTOR SIZE-100                N=5
import pandas as pd

counts = dict()
index_map = dict()
current_index = 0

def ngrams_freq(content, n):
    global current_index, counts, index_map
    words = content.split()

    grams = [' '.join(words[i:i+n]) for i in range(len(words)-n+1)]

    counts = {}
    index_map = {}

    for gram in grams:
        if gram not in counts:
            counts[gram] = 1
        else:
            counts[gram] += 1




    # Print the count size


    result = []
    for i in range(len(words) - n + 1):
        chunk = ' '.join(words[i:i+n])
        if chunk in grams:
            if chunk not in index_map:
                index_map[chunk] = current_index
                current_index += 1
            result.append(str(index_map[chunk]))
        else:
            result.append("UNK")

    # print(current_index)

    return ' '.join(result)

def create_ngrams_index(input_csv, output_csv, n):
    df = pd.read_csv(input_csv)

    # Assuming you want to create n-grams
    df['text'] = df['text'].astype(str).apply(lambda x: ngrams_freq(x, n))

    X = df['text'].tolist()
    y = df['label'].tolist()
    # Save the modified DataFrame to a new CSV file
    df.to_csv(output_csv, index=False)
    return X,y



input_csv = "hindi_sentiment_analysis_padded.csv"  # Replace with your actual CSV file path
output_csv = "hindi_ngrams.csv"  # Replace with your desired output CSV file path
n_grams = 5  # Set the desired value for n

X,y=create_ngrams_index(input_csv, output_csv, n_grams)

# Assuming ngram_dict is the dictionary you want to check

sentences = X
print(sentences)

print(X)
print(y)

word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Convert sentences to average Word2Vec vectors
X_word2vec = np.array([np.mean([word2vec_model.wv[word] for word in sentence], axis=0) for sentence in sentences])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_word2vec, y, test_size=0.3, random_state=42, stratify=y)

y_train = np.array(y_train)
y_test = np.array(y_test)

X_train_reshaped = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test_reshaped = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

# One-hot encode the labels
encoder = OneHotEncoder(sparse=False, categories='auto')
y_train_onehot = encoder.fit_transform(y_train.reshape(-1, 1))
y_test_onehot = encoder.transform(y_test.reshape(-1, 1))

# Build a stacked recurrent model
stacked_model = Sequential()

# LSTM layer
stacked_model.add(LSTM(100, input_shape=(1, X_train.shape[1]), return_sequences=True))

# Bidirectional LSTM layer
stacked_model.add(Bidirectional(LSTM(100, return_sequences=True)))

# GRU layer
stacked_model.add(GRU(100, return_sequences=True))

# Bidirectional GRU layer
stacked_model.add(Bidirectional(GRU(100, return_sequences=True)))

# Global max pooling layer to reduce dimensionality
stacked_model.add(tf.keras.layers.GlobalMaxPooling1D())

# Dense layer for classification
stacked_model.add(Dense(3, activation='softmax'))  # Assuming you have 3 classes: positive, negative, neutral

# Compile the model
stacked_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
stacked_model.fit(X_train_reshaped, y_train_onehot, epochs=30, validation_data=(X_test_reshaped, y_test_onehot))

# Evaluate on test data
accuracy = stacked_model.evaluate(X_test_reshaped, y_test_onehot)[1]
print(f"Test Accuracy: {accuracy * 100:.2f}%")



IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Test Accuracy: 66.56%


In [None]:
##VECTOR SIZE-50               N=5
import pandas as pd

counts = dict()
index_map = dict()
current_index = 0

def ngrams_freq(content, n):
    global current_index, counts, index_map
    words = content.split()

    grams = [' '.join(words[i:i+n]) for i in range(len(words)-n+1)]

    counts = {}
    index_map = {}

    for gram in grams:
        if gram not in counts:
            counts[gram] = 1
        else:
            counts[gram] += 1




    # Print the count size


    result = []
    for i in range(len(words) - n + 1):
        chunk = ' '.join(words[i:i+n])
        if chunk in grams:
            if chunk not in index_map:
                index_map[chunk] = current_index
                current_index += 1
            result.append(str(index_map[chunk]))
        else:
            result.append("UNK")


    return ' '.join(result)

def create_ngrams_index(input_csv, output_csv, n):
    df = pd.read_csv(input_csv)

    # Assuming you want to create n-grams
    df['text'] = df['text'].astype(str).apply(lambda x: ngrams_freq(x, n))

    X = df['text'].tolist()
    y = df['label'].tolist()
    # Save the modified DataFrame to a new CSV file
    df.to_csv(output_csv, index=False)
    return X,y



input_csv = "hindi_sentiment_analysis_padded.csv"  # Replace with your actual CSV file path
output_csv = "hindi_ngrams.csv"  # Replace with your desired output CSV file path
n_grams = 5  # Set the desired value for n

X,y=create_ngrams_index(input_csv, output_csv, n_grams)

# Assuming ngram_dict is the dictionary you want to check

sentences = X
print(sentences)

print(X)
print(y)

word2vec_model = Word2Vec(sentences, vector_size=50, window=5, min_count=1, workers=4)

# Convert sentences to average Word2Vec vectors
X_word2vec = np.array([np.mean([word2vec_model.wv[word] for word in sentence], axis=0) for sentence in sentences])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_word2vec, y, test_size=0.3, random_state=42, stratify=y)

y_train = np.array(y_train)
y_test = np.array(y_test)

X_train_reshaped = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test_reshaped = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

# One-hot encode the labels
encoder = OneHotEncoder(sparse=False, categories='auto')
y_train_onehot = encoder.fit_transform(y_train.reshape(-1, 1))
y_test_onehot = encoder.transform(y_test.reshape(-1, 1))

# Build a stacked recurrent model
stacked_model = Sequential()

# LSTM layer
stacked_model.add(LSTM(100, input_shape=(1, X_train.shape[1]), return_sequences=True))

# Bidirectional LSTM layer
stacked_model.add(Bidirectional(LSTM(100, return_sequences=True)))

# GRU layer
stacked_model.add(GRU(100, return_sequences=True))

# Bidirectional GRU layer
stacked_model.add(Bidirectional(GRU(100, return_sequences=True)))

# Global max pooling layer to reduce dimensionality
stacked_model.add(tf.keras.layers.GlobalMaxPooling1D())

# Dense layer for classification
stacked_model.add(Dense(3, activation='softmax'))  # Assuming you have 3 classes: positive, negative, neutral

# Compile the model
stacked_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
stacked_model.fit(X_train_reshaped, y_train_onehot, epochs=30, validation_data=(X_test_reshaped, y_test_onehot))

# Evaluate on test data
accuracy = stacked_model.evaluate(X_test_reshaped, y_test_onehot)[1]
print(f"Test Accuracy: {accuracy * 100:.2f}%")



IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Test Accuracy: 66.59%


In [None]:
##VECTOR SIZE-50              N=3
import pandas as pd
from gensim.models import Word2Vec
from tensorflow.keras.layers import Bidirectional, LSTM, GRU


counts = dict()
index_map = dict()
current_index = 0

def ngrams_freq(content, n):
    global current_index, counts, index_map
    words = content.split()

    grams = [' '.join(words[i:i+n]) for i in range(len(words)-n+1)]

    counts = {}
    index_map = {}

    for gram in grams:
        if gram not in counts:
            counts[gram] = 1
        else:
            counts[gram] += 1




    # Print the count size


    result = []
    for i in range(len(words) - n + 1):
        chunk = ' '.join(words[i:i+n])
        if chunk in grams:
            if chunk not in index_map:
                index_map[chunk] = current_index
                current_index += 1
            result.append(str(index_map[chunk]))
        else:
            result.append("UNK")


    return ' '.join(result)

def create_ngrams_index(input_csv, output_csv, n):
    df = pd.read_csv(input_csv)

    # Assuming you want to create n-grams
    df['text'] = df['text'].astype(str).apply(lambda x: ngrams_freq(x, n))

    X = df['text'].tolist()
    y = df['label'].tolist()
    # Save the modified DataFrame to a new CSV file
    df.to_csv(output_csv, index=False)
    return X,y



input_csv = "hindi_sentiment_analysis_padded.csv"  # Replace with your actual CSV file path
output_csv = "hindi_ngrams.csv"  # Replace with your desired output CSV file path
n_grams = 3  # Set the desired value for n

X,y=create_ngrams_index(input_csv, output_csv, n_grams)

# Assuming ngram_dict is the dictionary you want to check

sentences = X
print(sentences)

print(X)
print(y)

word2vec_model = Word2Vec(sentences, vector_size=50, window=5, min_count=1, workers=4)

# Convert sentences to average Word2Vec vectors
X_word2vec = np.array([np.mean([word2vec_model.wv[word] for word in sentence], axis=0) for sentence in sentences])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_word2vec, y, test_size=0.3, random_state=42, stratify=y)

y_train = np.array(y_train)
y_test = np.array(y_test)

X_train_reshaped = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test_reshaped = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

# One-hot encode the labels
encoder = OneHotEncoder(sparse=False, categories='auto')
y_train_onehot = encoder.fit_transform(y_train.reshape(-1, 1))
y_test_onehot = encoder.transform(y_test.reshape(-1, 1))

# Build a stacked recurrent model
stacked_model = Sequential()

# LSTM layer
stacked_model.add(LSTM(100, input_shape=(1, X_train.shape[1]), return_sequences=True))

# Bidirectional LSTM layer
stacked_model.add(Bidirectional(LSTM(100, return_sequences=True)))

# GRU layer
stacked_model.add(GRU(100, return_sequences=True))

# Bidirectional GRU layer
stacked_model.add(Bidirectional(GRU(100, return_sequences=True)))

# Global max pooling layer to reduce dimensionality
stacked_model.add(tf.keras.layers.GlobalMaxPooling1D())

# Dense layer for classification
stacked_model.add(Dense(3, activation='softmax'))  # Assuming you have 3 classes: positive, negative, neutral

# Compile the model
stacked_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
stacked_model.fit(X_train_reshaped, y_train_onehot, epochs=30, validation_data=(X_test_reshaped, y_test_onehot))

# Evaluate on test data
accuracy = stacked_model.evaluate(X_test_reshaped, y_test_onehot)[1]
print(f"Test Accuracy: {accuracy * 100:.2f}%")



IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Test Accuracy: 68.21%


In [None]:
##VECTOR SIZE-100               N=3
import pandas as pd
from gensim.models import Word2Vec

counts = dict()
index_map = dict()
current_index = 0

def ngrams_freq(content, n):
    global current_index, counts, index_map
    words = content.split()

    grams = [' '.join(words[i:i+n]) for i in range(len(words)-n+1)]

    counts = {}
    index_map = {}

    for gram in grams:
        if gram not in counts:
            counts[gram] = 1
        else:
            counts[gram] += 1




    # Print the count size


    result = []
    for i in range(len(words) - n + 1):
        chunk = ' '.join(words[i:i+n])
        if chunk in grams:
            if chunk not in index_map:
                index_map[chunk] = current_index
                current_index += 1
            result.append(str(index_map[chunk]))
        else:
            result.append("UNK")


    return ' '.join(result)

def create_ngrams_index(input_csv, output_csv, n):
    df = pd.read_csv(input_csv)

    # Assuming you want to create n-grams
    df['text'] = df['text'].astype(str).apply(lambda x: ngrams_freq(x, n))

    X = df['text'].tolist()
    y = df['label'].tolist()
    # Save the modified DataFrame to a new CSV file
    df.to_csv(output_csv, index=False)
    return X,y



input_csv = "hindi_sentiment_analysis_padded.csv"  # Replace with your actual CSV file path
output_csv = "hindi_ngrams.csv"  # Replace with your desired output CSV file path
n_grams = 3  # Set the desired value for n

X,y=create_ngrams_index(input_csv, output_csv, n_grams)

# Assuming ngram_dict is the dictionary you want to check

sentences = X
print(sentences)

print(X)
print(y)

word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Convert sentences to average Word2Vec vectors
X_word2vec = np.array([np.mean([word2vec_model.wv[word] for word in sentence], axis=0) for sentence in sentences])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_word2vec, y, test_size=0.3, random_state=42, stratify=y)

y_train = np.array(y_train)
y_test = np.array(y_test)

X_train_reshaped = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test_reshaped = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

# One-hot encode the labels
encoder = OneHotEncoder(sparse=False, categories='auto')
y_train_onehot = encoder.fit_transform(y_train.reshape(-1, 1))
y_test_onehot = encoder.transform(y_test.reshape(-1, 1))

# Build a stacked recurrent model
stacked_model = Sequential()

# LSTM layer
stacked_model.add(LSTM(100, input_shape=(1, X_train.shape[1]), return_sequences=True))

# Bidirectional LSTM layer
stacked_model.add(Bidirectional(LSTM(100, return_sequences=True)))

# GRU layer
stacked_model.add(GRU(100, return_sequences=True))

# Bidirectional GRU layer
stacked_model.add(Bidirectional(GRU(100, return_sequences=True)))

# Global max pooling layer to reduce dimensionality
stacked_model.add(tf.keras.layers.GlobalMaxPooling1D())

# Dense layer for classification
stacked_model.add(Dense(3, activation='softmax'))  # Assuming you have 3 classes: positive, negative, neutral

# Compile the model
stacked_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
stacked_model.fit(X_train_reshaped, y_train_onehot, epochs=30, validation_data=(X_test_reshaped, y_test_onehot))

# Evaluate on test data
accuracy = stacked_model.evaluate(X_test_reshaped, y_test_onehot)[1]
print(f"Test Accuracy: {accuracy * 100:.2f}%")



In [None]:
##VECTOR SIZE-150             N=3
import pandas as pd

counts = dict()
index_map = dict()
current_index = 0

def ngrams_freq(content, n):
    global current_index, counts, index_map
    words = content.split()

    grams = [' '.join(words[i:i+n]) for i in range(len(words)-n+1)]

    counts = {}
    index_map = {}

    for gram in grams:
        if gram not in counts:
            counts[gram] = 1
        else:
            counts[gram] += 1




    # Print the count size


    result = []
    for i in range(len(words) - n + 1):
        chunk = ' '.join(words[i:i+n])
        if chunk in grams:
            if chunk not in index_map:
                index_map[chunk] = current_index
                current_index += 1
            result.append(str(index_map[chunk]))
        else:
            result.append("UNK")


    return ' '.join(result)

def create_ngrams_index(input_csv, output_csv, n):
    df = pd.read_csv(input_csv)

    # Assuming you want to create n-grams
    df['text'] = df['text'].astype(str).apply(lambda x: ngrams_freq(x, n))

    X = df['text'].tolist()
    y = df['label'].tolist()
    # Save the modified DataFrame to a new CSV file
    df.to_csv(output_csv, index=False)
    return X,y



input_csv = "hindi_sentiment_analysis_padded.csv"  # Replace with your actual CSV file path
output_csv = "hindi_ngrams.csv"  # Replace with your desired output CSV file path
n_grams = 3  # Set the desired value for n

X,y=create_ngrams_index(input_csv, output_csv, n_grams)

# Assuming ngram_dict is the dictionary you want to check

sentences = X
print(sentences)

print(X)
print(y)

word2vec_model = Word2Vec(sentences, vector_size=50, window=5, min_count=1, workers=4)

# Convert sentences to average Word2Vec vectors
X_word2vec = np.array([np.mean([word2vec_model.wv[word] for word in sentence], axis=0) for sentence in sentences])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_word2vec, y, test_size=0.3, random_state=42, stratify=y)

y_train = np.array(y_train)
y_test = np.array(y_test)

X_train_reshaped = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test_reshaped = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

# One-hot encode the labels
encoder = OneHotEncoder(sparse=False, categories='auto')
y_train_onehot = encoder.fit_transform(y_train.reshape(-1, 1))
y_test_onehot = encoder.transform(y_test.reshape(-1, 1))

# Build a stacked recurrent model
stacked_model = Sequential()

# LSTM layer
stacked_model.add(LSTM(100, input_shape=(1, X_train.shape[1]), return_sequences=True))

# Bidirectional LSTM layer
stacked_model.add(Bidirectional(LSTM(100, return_sequences=True)))

# GRU layer
stacked_model.add(GRU(100, return_sequences=True))

# Bidirectional GRU layer
stacked_model.add(Bidirectional(GRU(100, return_sequences=True)))

# Global max pooling layer to reduce dimensionality
stacked_model.add(tf.keras.layers.GlobalMaxPooling1D())

# Dense layer for classification
stacked_model.add(Dense(3, activation='softmax'))  # Assuming you have 3 classes: positive, negative, neutral

# Compile the model
stacked_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
stacked_model.fit(X_train_reshaped, y_train_onehot, epochs=30, validation_data=(X_test_reshaped, y_test_onehot))

# Evaluate on test data
accuracy = stacked_model.evaluate(X_test_reshaped, y_test_onehot)[1]
print(f"Test Accuracy: {accuracy * 100:.2f}%")



IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Test Accuracy: 67.66%


In [None]:
##PRETRAINED MODEL

from transformers import AutoTokenizer, AutoModel
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import tensorflow as tf
import torch
import pandas as pd



file_path = r'hindi.csv'
df = pd.read_csv(file_path)


# Tokenize input text
tokenizer = AutoTokenizer.from_pretrained('ai4bharat/indic-bert')
model = AutoModel.from_pretrained('ai4bharat/indic-bert')

X = df['text'].tolist()
y = df['label'].tolist()

max_length = 128  # Adjust as needed
tokenized_input = tokenizer(X, return_tensors='pt', padding=True, truncation=True, max_length=max_length)

with torch.no_grad():
    word_embeddings = model(**tokenized_input).last_hidden_state

# Convert PyTorch tensor to NumPy array
word_embeddings_np = word_embeddings.numpy()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(word_embeddings_np, y, test_size=0.3, random_state=42, stratify=y)

y_train = np.array(y_train)

y_test = np.array(y_test)
# One-hot encode the labels
encoder = OneHotEncoder(sparse=False, categories='auto')
y_train_onehot = encoder.fit_transform(y_train.reshape(-1, 1))
y_test_onehot = encoder.transform(y_test.reshape(-1, 1))

# Build an LSTM model
lstm_model = Sequential()
lstm_model.add(LSTM(100))
lstm_model.add(Dense(3, activation='softmax'))  # Assuming you have 3 classes: positive, negative, neutral

# Compile the model
lstm_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the LSTM model
lstm_model.fit(X_train, y_train_onehot, epochs=10, validation_data=(X_test, y_test_onehot))

# Evaluate on test data
accuracy = lstm_model.evaluate(X_test, y_test_onehot)[1]
print(f"Test Accuracy: {accuracy * 100:.2f}%")


In [None]:
import pandas as pd
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the Kaggle dataset without header
file_path = r'bbbb.csv'
df = pd.read_csv(file_path, header=None, names=['text', 'label'], skiprows=1)  # Skip the first row

# Assuming the first column contains Hindi text
hindi_text = df['text'].astype(str).tolist()

hindi_text = [re.sub(r'[^ ँ-ःअ-ऋए-ऑओ-नप-रल-ळव-हा़ी-ूॅ-ैॉ-ोौ्]', '', text) for text in hindi_text]
hindi_text = [re.sub(r'\s+', ' ', text).strip() for text in hindi_text]

# Find the maximum length of text
max_length = 0
for text in hindi_text:
    words = text.strip().split()
    num_words = len(words)

    if num_words > max_length:
        max_length = num_words

# Pad all other texts to the maximum length with a neutral Hindi word
neutral_word = 'न्यूट्रलस'  # Replace this with an appropriate neutral word
padded_sequences = []

for text in hindi_text:
    words = text.strip().split()
    num_words = len(words)

    padding_length = max_length - num_words
    padded_text = text + (' ' + neutral_word) * padding_length
    padded_sequences.append(padded_text)

# Tokenize the Hindi text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(padded_sequences)
sequences = tokenizer.texts_to_sequences(padded_sequences)

# Padding sequences for consistent length (if needed)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

# Create a new DataFrame with the padded sequences and labels
padded_df = pd.DataFrame({'text': padded_sequences.tolist(), 'label': df['label'].tolist()})

# Replace labels: positive (0), negative (1), neutral (2)
label_mapping = {'positive': 0, 'negative': 1, 'neutral': 2}
padded_df['label'] = padded_df['label'].map(label_mapping)

# Save the new DataFrame to a new CSV file
padded_file_path = r'bbbb_padded.csv'
padded_df.to_csv(padded_file_path, index=False)

print(f"\nNew CSV file with padded sequences: {padded_file_path}")



New CSV file with padded sequences: bbbb_padded.csv


In [None]:
import pandas as pd

counts = dict()
index_map = dict()
current_index = 0

def ngrams_freq(content, n):
    global current_index
    words = content.split()

    grams = [' '.join(words[i:i+n]) for i in range(len(words)-n+1)]

    counts = {}
    index_map = {}

    for gram in grams:
        if gram not in counts:
            counts[gram] = 1
        else:
            counts[gram] += 1

    result = []
    for i in range(len(words) - n + 1):
        chunk = ' '.join(words[i:i+n])
        if chunk in grams:
            if chunk not in index_map:
                index_map[chunk] = current_index
                current_index += 1
            result.append(str(index_map[chunk]))
        else:
            result.append("UNK")
    print(current_index)
    return ' '.join(result)

def create_ngrams_index(input_csv, output_csv, n):
    df = pd.read_csv(input_csv)

    # Assuming you want to create n-grams
    df['text'] = df['text'].astype(str).apply(lambda x: ngrams_freq(x, n))

    # Save the modified DataFrame to a new CSV file
    df.to_csv(output_csv, index=False)

def main():
    global current_index
    input_csv = "bbbb_padded.csv"  # Replace with your actual CSV file path
    output_csv = "bbbb_ngrams.csv"  # Replace with your desired output CSV file path
    n_grams = 5  # Set the desired value for n
    print(current_index)
    create_ngrams_index(input_csv, output_csv, n_grams)

if __name__ == "__main__":
    main()


0
21
36
50
65
85
94
119
143
165
185
200
206
211
233
244
262
287
312
337
355
379
404
407
410
413
416
419
422
425
428
431
434
437
440
443
446
449
452
455
458
461
464
467
470
473
476
479
482
485
488
491
494
497
500
503
506
509
512
515
518
521
524
527
530
533
536
539
542
545
548
551
554
557
560
563
566
569
572
575
578
581
584
587
590
593
596
599
602
605
608
611
614
617
620
623
626
629
632
635
638
641
644
647
650
653
656
659
662
665
668
671
674
677
680
683
686
689
692
695
698
701
704
707
710
713
716
719
722
725
728
731
734
737
740
743
746
749
752
755
758
761
764
767
770
773
776
779
782
785
788
791
794
797
800
803
806
809
812
815
818
821
824
827
830
833
836
839
842
845
848
851
854
857
860
863
866
869
872
875
878
881
884
887
890
893
896
899
902
905
908
911
914
917
920
923
926
929
932
935
938
941
944
947
950
953
956
959
962
965
968
971
974
977
980
983
986
989
992
995
998
1001
1004
1007
1010
1013
1016
1019
1022
1025
1028
1031
1034
1037
1040
1043
1046
1049
1052
1055
1058
1061
1064
1067
1070
1073

In [None]:

##GLOVE ALGO

from transformers import AutoTokenizer, AutoModel
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU, Bidirectional, Dense
import tensorflow as tf
from gensim.models import Word2Vec
from itertools import combinations


updated_file_path = r'bbbb_ngrams.csv'
df = pd.read_csv(updated_file_path, header=None, names=['text', 'label'], skiprows=1)
# Assuming ngram_dict is the dictionary you want to check

X = df['text'].tolist()
y = df['label'].tolist()

# print(X)
vocab_size = 6074



sentences = X

# Define window size and initialize co-occurrence matrix
window_size = 5
co_occurrence_matrix = np.zeros((vocab_size, vocab_size), dtype=np.float32)

# Iterate over sentences to fill the co-occurrence matrix

# Iterate over sentences to fill the co-occurrence matrix
# Iterate over sentences to fill the co-occurrence matrix


# Iterate over sentences to fill the co-occurrence matrix
for sentence in sentences:
    for center_word, context_word in combinations(sentence, 2):
        try:
            center_word_index = int(center_word)  # Try to convert to integer
            context_word_index = int(context_word)  # Try to convert to integer
        except ValueError:
            # If conversion to integer fails, continue to the next iteration
            continue

        # Now, you can use center_word_index and context_word_index in your logic
        co_occurrence_matrix[center_word_index, context_word_index] += 1
        co_occurrence_matrix[context_word_index, center_word_index] += 1





# Define a simple GloVe model
class GloveModel:
    def __init__(self, vocab_size, vector_size=100, learning_rate=0.05):
        self.W = np.random.rand(vocab_size, vector_size).astype(np.float32)
        self.b = np.random.rand(vocab_size).astype(np.float32)
        self.W_grad = np.zeros_like(self.W)
        self.b_grad = np.zeros_like(self.b)
        self.vector_size = vector_size
        self.learning_rate = learning_rate

    def train_step(self, target_word_index, context_word_index, co_occurrence_count):
        # Calculate predicted co-occurrence count
        prediction = np.dot(self.W[target_word_index], self.W[context_word_index]) + self.b[target_word_index] + self.b[context_word_index]
        diff = prediction - np.log(co_occurrence_count)

        # Update gradients
        self.W_grad[target_word_index] += diff * self.W[context_word_index]
        self.W_grad[context_word_index] += diff * self.W[target_word_index]
        self.b_grad[target_word_index] += diff
        self.b_grad[context_word_index] += diff

    def update_params(self):
        self.W -= self.learning_rate * self.W_grad
        self.b -= self.learning_rate * self.b_grad
        self.W_grad.fill(0)
        self.b_grad.fill(0)

# Train the GloVe model using the co-occurrence matrix
glove_model = GloveModel(vocab_size, vector_size=100, learning_rate=0.05)
num_epochs = 10

for epoch in range(num_epochs):
    for i in range(vocab_size):
        for j in range(vocab_size):
            if co_occurrence_matrix[i, j] > 0:
                glove_model.train_step(i, j, co_occurrence_matrix[i, j])

    glove_model.update_params()

# Save the trained word embeddings
word_embeddings = glove_model.W

# Optionally, you can use the trained word embeddings for downstream tasks
# For example, you can use gensim's Word2Vec wrapper to load the embeddings
gensim_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
gensim_model.wv.vectors = word_embeddings
gensim_model.save('glove_model')

  self.W_grad[target_word_index] += diff * self.W[context_word_index]
  self.W_grad[context_word_index] += diff * self.W[target_word_index]
  self.W -= self.learning_rate * self.W_grad


In [None]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
import numpy as np

# Load the saved GloVe model
glove_model = Word2Vec.load('glove_model')

# Convert list of strings to a list of lists (as Tokenizer expects a list of strings)
X = [[str(word) for word in str(sentence).split()] for sentence in X]

# Tokenize the text and convert it into sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)

# Pad sequences to have consistent length
max_sequence_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')

# Convert string labels to numerical labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Convert numerical labels to one-hot encoding
labels_one_hot = to_categorical(y_encoded)

# Create LSTM model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=max_sequence_length, weights=[glove_model.wv.vectors], trainable=False))
model.add(LSTM(100))
model.add(Dense(len(np.unique(y_encoded)), activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels_one_hot, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test Loss: nan, Test Accuracy: 1.0


In [None]:
##PRETRAINED MODEL

from transformers import AutoTokenizer, AutoModel
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import tensorflow as tf
import torch
import pandas as pd



file_path = r'hindi.csv'
df = pd.read_csv(file_path)


# Tokenize input text
tokenizer = AutoTokenizer.from_pretrained('ai4bharat/indic-bert')
model = AutoModel.from_pretrained('ai4bharat/indic-bert')

X = df['text'].tolist()
y = df['label'].tolist()

max_length = 128  # Adjust as needed
tokenized_input = tokenizer(X, return_tensors='pt', padding=True, truncation=True, max_length=max_length)

with torch.no_grad():
    word_embeddings = model(**tokenized_input).last_hidden_state

# Convert PyTorch tensor to NumPy array
word_embeddings_np = word_embeddings.numpy()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(word_embeddings_np, y, test_size=0.3, random_state=42, stratify=y)

y_train = np.array(y_train)

y_test = np.array(y_test)
# One-hot encode the labels
encoder = OneHotEncoder(sparse=False, categories='auto')
y_train_onehot = encoder.fit_transform(y_train.reshape(-1, 1))
y_test_onehot = encoder.transform(y_test.reshape(-1, 1))

# Build an LSTM model
lstm_model = Sequential()
lstm_model.add(LSTM(100))
lstm_model.add(Dense(3, activation='softmax'))  # Assuming you have 3 classes: positive, negative, neutral

# Compile the model
lstm_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the LSTM model
lstm_model.fit(X_train, y_train_onehot, epochs=10, validation_data=(X_test, y_test_onehot))

# Evaluate on test data
accuracy = lstm_model.evaluate(X_test, y_test_onehot)[1]
print(f"Test Accuracy: {accuracy * 100:.2f}%")




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 75.17%


In [None]:
##PRETRAINED MODEL

from transformers import AutoTokenizer, AutoModel
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import tensorflow as tf
import torch
import pandas as pd



file_path = r'hindi.csv'
df = pd.read_csv(file_path)


# Tokenize input text
tokenizer = AutoTokenizer.from_pretrained('ai4bharat/indic-bert')
model = AutoModel.from_pretrained('ai4bharat/indic-bert')

X = df['text'].tolist()
y = df['label'].tolist()

max_length = 128  # Adjust as needed
tokenized_input = tokenizer(X, return_tensors='pt', padding=True, truncation=True, max_length=max_length)

with torch.no_grad():
    word_embeddings = model(**tokenized_input).last_hidden_state

# Convert PyTorch tensor to NumPy array
word_embeddings_np = word_embeddings.numpy()
vector_size = word_embeddings_np.shape[-1]
print(f"Vector size of indic-bert model: {vector_size}")

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(word_embeddings_np, y, test_size=0.3, random_state=42, stratify=y)

y_train = np.array(y_train)

y_test = np.array(y_test)
# One-hot encode the labels
encoder = OneHotEncoder(sparse=False, categories='auto')
y_train_onehot = encoder.fit_transform(y_train.reshape(-1, 1))
y_test_onehot = encoder.transform(y_test.reshape(-1, 1))

# Build an LSTM model
lstm_model = Sequential()
lstm_model.add(LSTM(100))
lstm_model.add(Dense(3, activation='softmax'))  # Assuming you have 3 classes: positive, negative, neutral

# Compile the model
lstm_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the LSTM model
lstm_model.fit(X_train, y_train_onehot, epochs=20, validation_data=(X_test, y_test_onehot))

# Evaluate on test data
accuracy = lstm_model.evaluate(X_test, y_test_onehot)[1]
print(f"Test Accuracy: {accuracy * 100:.2f}%")


Vector size of indic-bert model: 768




Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test Accuracy: 73.61%
