In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split , GridSearchCV
import pandas as pd

In [23]:
train = pd.read_csv("/content/Clean_train.csv")
test = pd.read_csv("/content/Clean_test.csv")

In [24]:
x = train['text']
y = train['target']
xx_test = test["text"]

In [25]:
xx_test.isnull().sum()

1

In [26]:
xx_test.fillna('', inplace=True)

In [27]:
xx_test.isnull().sum()

0

In [28]:
tfidf_vectorizer = TfidfVectorizer()

In [29]:
x_tfidf = tfidf_vectorizer.fit_transform(x)
x_test_tfidf = tfidf_vectorizer.transform(xx_test)

In [30]:
x_train, x_test, y_train, y_test = train_test_split(x_tfidf,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42 ,
                                                    shuffle = True)

In [47]:
# Define the models
models = {
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(),
    'K-Neighbors Classifier': KNeighborsClassifier()
}

In [48]:
# Define the hyperparameter tuning space for each model
param_grids = {
    'Naive Bayes': {'alpha': [0.1, 1, 10]},
    'Logistic Regression': {'C': [0.1, 1, 10], 'penalty': ['l1', 'l2'], 'solver': ['liblinear', 'saga']},
    'K-Neighbors Classifier': {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']}
}

In [49]:
# Perform hyperparameter tuning and training for each model
for model_name, model in models.items():
    print(f"Training {model_name}...")
    grid_search = GridSearchCV(model, param_grids[model_name], cv=5, scoring='accuracy')
    grid_search.fit(x_train, y_train)
    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"Best Score: {grid_search.best_score_:.4f}")
    print()

Training Naive Bayes...
Best Parameters: {'alpha': 1}
Best Score: 0.7998

Training Logistic Regression...




Best Parameters: {'C': 1, 'penalty': 'l2', 'solver': 'saga'}
Best Score: 0.7913

Training K-Neighbors Classifier...
Best Parameters: {'n_neighbors': 7, 'weights': 'distance'}
Best Score: 0.7788



In [52]:
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

In [53]:
y_pred = best_estimator.predict(x_test)

In [54]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy from top 3 models after hyperparameter tuning:", accuracy)
print("Best Parameters:", best_params)

Accuracy from top 3 models after hyperparameter tuning: 0.7721602101116218
Best Parameters: {'n_neighbors': 7, 'weights': 'distance'}


In [55]:
y_pred = best_estimator.predict(x_test_tfidf)

submission9 = pd.DataFrame({
    "id": test["id"],
    "target": y_pred
})

In [46]:
y_pred = best_estimator.predict(x_test)

In [17]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of logistic regression after hyperparameter tuning:", accuracy)
print("Best Parameters:", best_params)

Accuracy of logistic regression after hyperparameter tuning: 0.7905449770190414
Best Parameters: {'C': 1, 'penalty': 'l2', 'solver': 'saga'}


In [20]:
y_pred_1 = best_estimator.predict(x_test_tfidf)

submission8 = pd.DataFrame({
    "id": test["id"],
    "target": y_pred_1
})

In [56]:
submission9.to_csv("Submission9.csv", index=False)