In [None]:
#Question 2:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt

# Load the review dataset, then hold out 30% of the rows for testing.
# The fixed random_state keeps the split identical from run to run.
df = pd.read_csv('imdb_dataset.csv')
reviews = df['review']
sentiments = df['sentiment']

train_data, test_data, train_labels, test_labels = train_test_split(
    reviews,
    sentiments,
    test_size=0.3,
    random_state=42,
)

# b) Perform necessary pre-text processing for vocabulary generation
# No separate cleaning pass is run here: the raw review text is handed
# straight to TfidfVectorizer, which is left on its default settings apart
# from max_features.

# c) Represent the text using TF-IDF feature weighting
# Cap the vocabulary at 5000 terms; raise or lower the cap as needed.
vectorizer_settings = {'max_features': 5000}
tfidf_vectorizer = TfidfVectorizer(**vectorizer_settings)

# Learn the vocabulary and IDF weights from the training reviews only, then
# reuse that fitted state to encode the held-out reviews.
train_tfidf = tfidf_vectorizer.fit_transform(train_data)
test_tfidf = tfidf_vectorizer.transform(test_data)

# d) Show the IDF scores of the first 50 keywords in the vocabulary
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so use get_feature_names_out() instead. It returns an array, so convert it
# back to a plain list to keep `feature_names` the same type as before.
feature_names = tfidf_vectorizer.get_feature_names_out().tolist()

# Term -> IDF weight lookup; feature_names and idf_ are zipped position by
# position, so both are assumed to be in the same column order.
idf_scores = dict(zip(feature_names, tfidf_vectorizer.idf_))

# Display IDF scores of the first 50 keywords
# Slice the two aligned sequences directly instead of copying every
# dictionary item into a list only to keep the first 50.
for keyword, idf_score in zip(feature_names[:50], tfidf_vectorizer.idf_[:50]):
    print(f"{keyword}: {idf_score}")

# e) Perform K-NN classification
# Sweep k = 1..9 and record the test-set accuracy obtained for each setting.
k_values = [*range(1, 10)]
scores = []

for k in k_values:
    # fit() returns the estimator itself, so build and train in one step.
    knn_classifier = KNeighborsClassifier(n_neighbors=k).fit(train_tfidf, train_labels)

    # Score this k on the test split and keep the result for the plot.
    predictions = knn_classifier.predict(test_tfidf)
    accuracy = accuracy_score(test_labels, predictions)
    scores += [accuracy]

# Plot KNN testing accuracy for different values of K
plt.plot(k_values, scores, marker='o')
plt.title('K-NN Classification Accuracy')
plt.ylabel('Testing Accuracy')
plt.xlabel('K Values')
plt.show()

# f) Train the dataset using Random Forest and Logistic Regression
# Both models are fitted on the same TF-IDF training matrix and use the same
# seed (42). fit() returns the fitted estimator, so each model is constructed
# and trained in a single expression.

# Random Forest (100 trees)
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(train_tfidf, train_labels)

# Logistic Regression (all other settings left at their defaults)
lr_classifier = LogisticRegression(random_state=42).fit(train_tfidf, train_labels)

# g) Compare the performances of different classifiers and show the confusion matrices
# The K-NN sweep leaves `knn_classifier` fitted with whichever k it tried
# last, so the comparison would silently use that k. Refit with the k that
# scored the highest test accuracy in the sweep instead; index() returns the
# first maximum, so ties resolve to the smallest k.
# NOTE(review): k is selected on the same test split it is evaluated on, so
# the K-NN figures are optimistic. Use a validation split or cross-validation
# if an unbiased comparison is required.
best_k = k_values[scores.index(max(scores))]
knn_classifier = KNeighborsClassifier(n_neighbors=best_k)
knn_classifier.fit(train_tfidf, train_labels)

# K-NN
knn_predictions = knn_classifier.predict(test_tfidf)
print(f"K-NN (k={best_k}) Accuracy: {accuracy_score(test_labels, knn_predictions):.4f}")
print("K-NN Confusion Matrix:")
print(confusion_matrix(test_labels, knn_predictions))

# Random Forest
rf_predictions = rf_classifier.predict(test_tfidf)
print(f"\nRandom Forest Accuracy: {accuracy_score(test_labels, rf_predictions):.4f}")
print("Random Forest Confusion Matrix:")
print(confusion_matrix(test_labels, rf_predictions))

# Logistic Regression
lr_predictions = lr_classifier.predict(test_tfidf)
print(f"\nLogistic Regression Accuracy: {accuracy_score(test_labels, lr_predictions):.4f}")
print("Logistic Regression Confusion Matrix:")
print(confusion_matrix(test_labels, lr_predictions))
