<a href="https://colab.research.google.com/github/ParthBindra/ML-Project/blob/main/MLA_PROJ_FINAL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#SVM
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import pandas as pd

# Read the dataset from the Colab filesystem into a DataFrame.
data = pd.read_csv('/content/ML_PROJ.csv')

# Features are the text in the 'ovc' column; targets come from the 'Label' column.
X, y = data['ovc'], data['Label']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the TF-IDF vocabulary on the training text only, then apply the same
# transformation to the held-out text.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Linear-kernel SVM; fit() returns the estimator itself, so construction and
# training can be chained.
svm_classifier = SVC(kernel='linear', random_state=42).fit(X_train_tfidf, y_train)

# Predict labels for the held-out features.
y_pred = svm_classifier.predict(X_test_tfidf)

# Accuracy plus support-weighted precision / recall / F1 (evaluated in that order).
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

# Report the metrics rounded to two decimals.
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-score: {f1:.2f}')


Accuracy: 0.59
Precision: 0.61
Recall: 0.59
F1-score: 0.59


In [None]:
#Naive Bayes
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the CSV that holds the text ('ovc') and target ('Label') columns.
data = pd.read_csv('/content/ML_PROJ.csv')
X, y = data['ovc'], data['Label']

# 80/20 train/test split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF features: vocabulary and IDF weights are learned from the training
# text only and then reused unchanged for the test text.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Multinomial Naive Bayes with default settings, trained on the TF-IDF matrix.
nb_classifier = MultinomialNB().fit(X_train_tfidf, y_train)

# Labels predicted for the held-out rows.
y_pred = nb_classifier.predict(X_test_tfidf)

# Overall accuracy, then the three support-weighted scores.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = [
    metric(y_test, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
]

# Print each score with two decimal places.
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-score: {f1:.2f}')


Accuracy: 0.63
Precision: 0.67
Recall: 0.63
F1-score: 0.61


In [None]:
# ANN (Embedding -> Flatten -> Dense) text classifier
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import set_random_seed
import numpy as np

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable. Without this only the train/test split was seeded,
# and the reported metrics changed from run to run.
# NOTE(review): some GPU kernels may still be non-deterministic.
set_random_seed(42)

# Load the dataset from the Colab filesystem.
data = pd.read_csv('/content/ML_PROJ.csv')

# Text lives in the 'ovc' column and the targets in the 'Label' column.
X = data['ovc']
y = data['Label']

# Map the labels to integer ids 0..n_classes-1, as required by
# sparse_categorical_crossentropy.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the word index from the training text only, then convert both splits
# into sequences of word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad (with zeros, at the end) or truncate every sequence to a common length.
maxlen = 100  # Maximum length of sequences
X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen, padding='post')

# Build the ANN model. input_dim is vocabulary size + 1 because word ids start
# at 1 and id 0 is used for padding. No masking here: Flatten does not accept
# a mask, so padded positions are simply embedded like any other id.
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=100, input_length=maxlen))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))  # Multiclass output layer

# Integer labels + softmax output -> sparse categorical cross-entropy.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train, holding out 10% of the training rows for validation.
model.fit(X_train_pad, y_train, epochs=10, batch_size=32, validation_split=0.1)

# Predicted class = index of the highest softmax probability.
y_pred_prob = model.predict(X_test_pad)
y_pred = np.argmax(y_pred_prob, axis=1)

# Accuracy plus support-weighted precision / recall / F1 on the test split.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the evaluation metrics
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-score: {f1:.2f}')


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.68
Precision: 0.68
Recall: 0.68
F1-score: 0.68


In [None]:
# LSTM text classifier
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import set_random_seed

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and
# batch shuffling are repeatable between runs.
# NOTE(review): some GPU kernels may still be non-deterministic.
set_random_seed(42)

# Load the dataset from the Colab filesystem.
data = pd.read_csv('/content/ML_PROJ.csv')

# Text lives in the 'ovc' column and the targets in the 'Label' column.
X = data['ovc']
y = data['Label']

# Map the labels to integer ids 0..n_classes-1, as required by
# sparse_categorical_crossentropy.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Build the word index from the training text only, then convert both splits
# into sequences of word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad (with zeros, at the end) or truncate every sequence to a common length.
maxlen = 100  # Maximum length of sequences
X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen, padding='post')

# Build the RNN model.
# mask_zero=True makes the LSTM skip the trailing padding steps. Without it
# the final LSTM state is computed after a long run of padding tokens, which
# washes out the real words; the previous run of this cell collapsed to
# predicting a single class (accuracy 0.47, weighted precision 0.22).
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=100, input_length=maxlen, mask_zero=True))
model.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))  # Multiclass output layer

# Integer labels + softmax output -> sparse categorical cross-entropy.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train, holding out 10% of the training rows for validation.
model.fit(X_train_pad, y_train, epochs=10, batch_size=32, validation_split=0.1)

# Predicted class = index of the highest softmax probability.
y_pred_prob = model.predict(X_test_pad)
y_pred = y_pred_prob.argmax(axis=-1)

# Accuracy plus support-weighted precision / recall / F1 on the test split.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the evaluation metrics
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-score: {f1:.2f}')


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.47
Precision: 0.22
Recall: 0.47
F1-score: 0.30


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#LOGISTIC
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Pull the dataset in from the Colab filesystem.
data = pd.read_csv('/content/ML_PROJ.csv')

# 'ovc' holds the input text, 'Label' the target classes.
X, y = data['ovc'], data['Label']

# Reserve one fifth of the rows for testing (seeded for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Turn text into TF-IDF vectors; the vectorizer is fitted on training text
# only so no test-set statistics leak into the features.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Logistic Regression with a raised iteration cap (max_iter=1000),
# constructed and trained in one expression (fit() returns the estimator).
logreg_classifier = LogisticRegression(max_iter=1000, random_state=42).fit(
    X_train_tfidf, y_train
)

# Predictions for the held-out rows.
y_pred = logreg_classifier.predict(X_test_tfidf)

# Accuracy first, then support-weighted precision, recall and F1.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    score_fn(y_test, y_pred, average='weighted')
    for score_fn in (precision_score, recall_score, f1_score)
)

# Two-decimal summary of the scores.
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-score: {f1:.2f}')


Accuracy: 0.60
Precision: 0.62
Recall: 0.60
F1-score: 0.59


In [None]:
#Random Forest
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Read the CSV containing the text column ('ovc') and the target column ('Label').
data = pd.read_csv('/content/ML_PROJ.csv')
X, y = data['ovc'], data['Label']

# Seeded 80/20 split into training and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF representation: fit on the training text, reuse for the test text.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Random forest of 100 trees with a fixed seed, trained on the TF-IDF matrix
# (fit() returns the estimator, so it can be chained onto the constructor).
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_tfidf, y_train
)

# Predicted labels for the test partition.
y_pred = rf_classifier.predict(X_test_tfidf)

# Accuracy, followed by the support-weighted precision / recall / F1.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = [
    metric(y_test, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
]

# Show the scores with two decimals.
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-score: {f1:.2f}')

Accuracy: 0.68
Precision: 0.68
Recall: 0.68
F1-score: 0.68


In [None]:
# Bagging ensembles over Naive Bayes, linear SVM and Logistic Regression
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the dataset from the Colab filesystem.
data = pd.read_csv('/content/ML_PROJ.csv')

# Text lives in the 'ovc' column and the targets in the 'Label' column.
X = data['ovc']
y = data['Label']

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the TF-IDF features inside this cell. Previously X_train_tfidf /
# X_test_tfidf were only available if an earlier cell had been run first, so
# the cell raised NameError on a fresh kernel.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Initialize base classifiers (Naive Bayes, SVM, Logistic Regression)
nb_classifier = MultinomialNB()
svm_classifier = SVC(kernel='linear', probability=True, random_state=42)
logreg_classifier = LogisticRegression(max_iter=1000, random_state=42)

# One bagging ensemble (10 members) per base model. The base estimator is
# passed positionally because the keyword was renamed from `base_estimator`
# to `estimator` in scikit-learn 1.2 and the old name was later removed; the
# positional form works on both sides of that rename.
nb_bagging = BaggingClassifier(nb_classifier, n_estimators=10, random_state=42)
svm_bagging = BaggingClassifier(svm_classifier, n_estimators=10, random_state=42)
logreg_bagging = BaggingClassifier(logreg_classifier, n_estimators=10, random_state=42)

# Fit the Bagging classifiers on the training data
nb_bagging.fit(X_train_tfidf, y_train)
svm_bagging.fit(X_train_tfidf, y_train)
logreg_bagging.fit(X_train_tfidf, y_train)

# Make predictions using the Bagging classifiers
y_pred_nb = nb_bagging.predict(X_test_tfidf)
y_pred_svm = svm_bagging.predict(X_test_tfidf)
y_pred_logreg = logreg_bagging.predict(X_test_tfidf)

# Evaluate each ensemble: accuracy plus support-weighted precision / recall / F1.
accuracy_nb = accuracy_score(y_test, y_pred_nb)
precision_nb = precision_score(y_test, y_pred_nb, average='weighted')
recall_nb = recall_score(y_test, y_pred_nb, average='weighted')
f1_nb = f1_score(y_test, y_pred_nb, average='weighted')

accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm, average='weighted')
recall_svm = recall_score(y_test, y_pred_svm, average='weighted')
f1_svm = f1_score(y_test, y_pred_svm, average='weighted')

accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
precision_logreg = precision_score(y_test, y_pred_logreg, average='weighted')
recall_logreg = recall_score(y_test, y_pred_logreg, average='weighted')
f1_logreg = f1_score(y_test, y_pred_logreg, average='weighted')

# Print the evaluation metrics for each Bagging classifier
print('Naive Bayes Bagging:')
print(f'Accuracy: {accuracy_nb:.2f}')
print(f'Precision: {precision_nb:.2f}')
print(f'Recall: {recall_nb:.2f}')
print(f'F1-score: {f1_nb:.2f}')
print()

print('SVM Bagging:')
print(f'Accuracy: {accuracy_svm:.2f}')
print(f'Precision: {precision_svm:.2f}')
print(f'Recall: {recall_svm:.2f}')
print(f'F1-score: {f1_svm:.2f}')
print()

print('Logistic Regression Bagging:')
print(f'Accuracy: {accuracy_logreg:.2f}')
print(f'Precision: {precision_logreg:.2f}')
print(f'Recall: {recall_logreg:.2f}')
print(f'F1-score: {f1_logreg:.2f}')




Naive Bayes Bagging:
Accuracy: 0.60
Precision: 0.64
Recall: 0.60
F1-score: 0.58

SVM Bagging:
Accuracy: 0.65
Precision: 0.67
Recall: 0.65
F1-score: 0.64

Logistic Regression Bagging:
Accuracy: 0.62
Precision: 0.64
Recall: 0.62
F1-score: 0.62


In [None]:
# Boosting ensembles: AdaBoost over Naive Bayes and linear SVM, plus Gradient Boosting
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the dataset from the Colab filesystem.
data = pd.read_csv('/content/ML_PROJ.csv')

# Text lives in the 'ovc' column and the targets in the 'Label' column.
X = data['ovc']
y = data['Label']

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the TF-IDF features inside this cell. Previously X_train_tfidf /
# X_test_tfidf were only available if an earlier cell had been run first, so
# the cell raised NameError on a fresh kernel.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Initialize base classifiers (Naive Bayes, SVM, Logistic Regression).
# NOTE: logreg_classifier is not used by any ensemble below; it is kept only
# so the name stays defined for anything that may reference it.
nb_classifier = MultinomialNB()
svm_classifier = SVC(kernel='linear', probability=True, random_state=42)
logreg_classifier = LogisticRegression(max_iter=1000, random_state=42)

# AdaBoost (50 rounds) over Naive Bayes and over the SVM. The base estimator
# is passed positionally because the keyword was renamed from `base_estimator`
# to `estimator` in scikit-learn 1.2 and the old name was later removed; the
# positional form works on both sides of that rename.
nb_boosting = AdaBoostClassifier(nb_classifier, n_estimators=50, random_state=42)
svm_boosting = AdaBoostClassifier(svm_classifier, n_estimators=50, random_state=42)

# NOTE: despite the `logreg` in the variable names, this third model is a
# plain GradientBoostingClassifier -- it does not boost Logistic Regression.
# The names are kept for compatibility; the printed label below reflects the
# model that is actually trained.
logreg_boosting = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Fit the Boosting classifiers on the training data
nb_boosting.fit(X_train_tfidf, y_train)
svm_boosting.fit(X_train_tfidf, y_train)
logreg_boosting.fit(X_train_tfidf, y_train)

# Make predictions using the Boosting classifiers
y_pred_nb = nb_boosting.predict(X_test_tfidf)
y_pred_svm = svm_boosting.predict(X_test_tfidf)
y_pred_logreg = logreg_boosting.predict(X_test_tfidf)

# Evaluate each ensemble: accuracy plus support-weighted precision / recall / F1.
accuracy_nb = accuracy_score(y_test, y_pred_nb)
precision_nb = precision_score(y_test, y_pred_nb, average='weighted')
recall_nb = recall_score(y_test, y_pred_nb, average='weighted')
f1_nb = f1_score(y_test, y_pred_nb, average='weighted')

accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm, average='weighted')
recall_svm = recall_score(y_test, y_pred_svm, average='weighted')
f1_svm = f1_score(y_test, y_pred_svm, average='weighted')

accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
precision_logreg = precision_score(y_test, y_pred_logreg, average='weighted')
recall_logreg = recall_score(y_test, y_pred_logreg, average='weighted')
f1_logreg = f1_score(y_test, y_pred_logreg, average='weighted')

# Print the evaluation metrics for each Boosting classifier
print('Naive Bayes Boosting:')
print(f'Accuracy: {accuracy_nb:.2f}')
print(f'Precision: {precision_nb:.2f}')
print(f'Recall: {recall_nb:.2f}')
print(f'F1-score: {f1_nb:.2f}')
print()

print('SVM Boosting:')
print(f'Accuracy: {accuracy_svm:.2f}')
print(f'Precision: {precision_svm:.2f}')
print(f'Recall: {recall_svm:.2f}')
print(f'F1-score: {f1_svm:.2f}')
print()

# Label corrected: the model evaluated here is Gradient Boosting.
print('Gradient Boosting:')
print(f'Accuracy: {accuracy_logreg:.2f}')
print(f'Precision: {precision_logreg:.2f}')
print(f'Recall: {recall_logreg:.2f}')
print(f'F1-score: {f1_logreg:.2f}')
