In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.ensemble import StackingClassifier
import pandas as pd

In [None]:
# Load the training and test CSV files into DataFrames.
train_csv = "/content/Clean_train.csv"
test_csv = "/content/Clean_test.csv"
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

In [None]:
# Training inputs (text column) and labels (target column).
x, y = train['text'], train['target']
# Text column of the test file, for which predictions are produced later.
xx_test = test["text"]

In [None]:
# Count missing entries in the test text column.
xx_test.isna().sum()

1

In [None]:
# Replace missing test text with an empty string so the vectorizer receives a
# string for every row. Rebind instead of using inplace=True: the result no
# longer depends on whether `xx_test` shares memory with `test`, and the cell
# can be re-run safely.
xx_test = xx_test.fillna('')

In [None]:
# Re-count missing entries in the test text column.
xx_test.isna().sum()

0

In [None]:
# TF-IDF text vectorizer with default settings; fitted in the next cell.
tfidf_vectorizer = TfidfVectorizer()

In [None]:
# Learn the vocabulary/IDF weights from the training text and vectorize it,
# then project the test text onto the same fitted vocabulary.
# NOTE(review): the vectorizer is fitted on every training row before the
# hold-out split below, so validation rows influence the vocabulary and IDF
# weights. Consider splitting the raw text first and fitting on the training
# part only — confirm whether this matters for the reported accuracy.
x_tfidf = tfidf_vectorizer.fit_transform(raw_documents=x)
x_test_tfidf = tfidf_vectorizer.transform(raw_documents=xx_test)

In [None]:
# Hold out 20% of the vectorized training rows for validation, using a fixed
# seed so the shuffled split is repeatable.
split_options = dict(test_size=0.2, random_state=42, shuffle=True)
x_train, x_test, y_train, y_test = train_test_split(x_tfidf, y, **split_options)

In [None]:
# Base classifiers, keyed by the display name used for tuning and stacking.
models = {
    'Naive Bayes': MultinomialNB(),
    # The grid below searches the 'liblinear' and 'saga' solvers, both of which
    # shuffle the data: fix random_state so repeated runs give the same result.
    # max_iter is raised from the default of 100 because 'saga' is an iterative
    # solver and is cut off when the cap is reached.
    # NOTE(review): confirm 1000 iterations is enough for convergence here.
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'K-Neighbors Classifier': KNeighborsClassifier()
}

In [None]:
# Candidate hyperparameter values to search, one grid per model name.
nb_grid = {'alpha': [0.1, 1, 10]}
logreg_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],
}
knn_grid = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance'],
}
param_grids = {
    'Naive Bayes': nb_grid,
    'Logistic Regression': logreg_grid,
    'K-Neighbors Classifier': knn_grid,
}

In [None]:
# Tune each model with a 5-fold cross-validated grid search on the training
# split, report the best setting, and store that setting back on the model.
for model_name, model in models.items():
    print(f"Training {model_name}...")
    grid_search = GridSearchCV(model, param_grids[model_name], cv=5, scoring='accuracy')
    grid_search.fit(x_train, y_train)
    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"Best Score: {grid_search.best_score_:.4f}")
    print()
    # GridSearchCV fits clones of `model`, and `grid_search` is overwritten on
    # the next iteration, so without this step the winning parameters are only
    # printed and then lost. Writing them onto the instance held in `models`
    # means later cells that read `models` get the tuned configuration.
    model.set_params(**grid_search.best_params_)

Training Naive Bayes...
Best Parameters: {'alpha': 1}
Best Score: 0.7998

Training Logistic Regression...




Best Parameters: {'C': 1, 'penalty': 'l2', 'solver': 'saga'}
Best Score: 0.7913

Training K-Neighbors Classifier...
Best Parameters: {'n_neighbors': 7, 'weights': 'distance'}
Best Score: 0.7788



In [None]:
# Base learners for the stacking ensemble as (name, estimator) pairs, taken
# from `models` with whatever parameter settings those instances currently hold.
estimators = [
    (label, models[label])
    for label in ('Naive Bayes', 'Logistic Regression', 'K-Neighbors Classifier')
]

In [None]:
# Stack the base learners under a default logistic-regression meta-learner.
meta_learner = LogisticRegression()
stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=meta_learner,
)

In [None]:
# Fit the stacking ensemble on the training split.
stacking_clf.fit(X=x_train, y=y_train)

In [None]:
# Predict labels for the held-out validation split.
y_pred = stacking_clf.predict(X=x_test)

In [None]:
# Score the stacking ensemble's predictions against the held-out labels.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy of stacking classifier: {accuracy:.4f}")

Accuracy of stacking classifier: 0.7958


In [None]:
# Predict labels for the vectorized test text.
y_pred_test = stacking_clf.predict(X=x_test_tfidf)

# Pair each test id with its predicted target to form the submission table.
submission10 = test[["id"]].assign(target=y_pred_test)

In [None]:
# Write the submission table to CSV without the DataFrame index column.
submission10.to_csv(path_or_buf="Submission10.csv", index=False)