In [19]:
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import RidgeClassifier
from sklearn.dummy import DummyClassifier
import numpy as np
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split , GridSearchCV
import pandas as pd

In [20]:
# Read the labelled training data and the test data to be predicted from /content.
train = pd.read_csv(filepath_or_buffer="/content/C_TRAIN.csv")
test = pd.read_csv(filepath_or_buffer="/content/C_TEST.csv")

In [21]:
# Training text and its target labels come from the same frame.
x, y = train['Content'], train['Label']
# Text column of the test frame; it is vectorized and predicted on further down.
xx_test = test["Content"]

In [22]:
# Count missing entries in the test text (isna is the canonical name of isnull).
xx_test.isna().sum()

0

In [23]:
# Replace missing test documents with empty strings.
# The result is rebound instead of using inplace=True: xx_test was taken from
# the `test` frame, and an in-place fill can write through to that frame (or
# trigger copy warnings, depending on the pandas version). Rebinding leaves
# `test` untouched and keeps this cell safe to re-run.
xx_test = xx_test.fillna('')

In [24]:
# Re-check after filling: no missing entries should remain in the test text.
xx_test.isna().sum()

0

In [25]:
# TF-IDF text vectorizer; no arguments are passed, so scikit-learn's defaults apply.
tfidf_vectorizer = TfidfVectorizer()

In [26]:
# Learn the vocabulary and IDF weights from the training text, then map the
# test text into that same feature space.
x_tfidf = tfidf_vectorizer.fit_transform(x)
x_test_tfidf = tfidf_vectorizer.transform(xx_test)

# Hold out part of the labelled data for validation. The tuning, fitting and
# evaluation cells below read x_train / y_train / x_test / y_test, but no cell
# in the notebook defined them, so a fresh "Restart & Run All" stopped with a
# NameError.
# NOTE(review): the settings of the original (missing) split are unknown;
# test_size and random_state here are assumptions, so reported scores may
# differ slightly from the saved outputs.
# NOTE(review): the vectorizer is fitted on all training text before the split,
# so the held-out rows influence the IDF weights; fit on x_train only if a
# stricter validation estimate is needed.
x_train, x_test, y_train, y_test = train_test_split(
    x_tfidf, y, test_size=0.2, random_state=42
)

In [27]:
# Candidate classifiers, keyed by the label printed while tuning.
models = dict([
    ('Naive Bayes', MultinomialNB()),
    ('Ridge Classifier', RidgeClassifier()),
    ('Dummy Classifier', DummyClassifier()),
])

In [28]:
# Grid-search candidates per model. The keys must match those of `models`,
# because the tuning loop looks each grid up by model name.
param_grids = {}
param_grids['Naive Bayes'] = {'alpha': [0.1, 1, 10]}
param_grids['Ridge Classifier'] = {'alpha': [0.1, 1, 10]}
# Nothing to tune for the baseline: an empty grid evaluates it once as-is.
param_grids['Dummy Classifier'] = {}

In [29]:
# Tune every candidate with a 5-fold, accuracy-scored grid search and report
# the best parameter set and its mean cross-validated score.
for model_name, model in models.items():
    print(f"Training {model_name}...")
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[model_name],
        scoring='accuracy',
        cv=5,
    )
    grid_search.fit(x_train, y_train)
    # One print call, same three output lines: parameters, score, blank separator.
    print(
        f"Best Parameters: {grid_search.best_params_}",
        f"Best Score: {grid_search.best_score_:.4f}",
        "",
        sep="\n",
    )

Training Naive Bayes...
Best Parameters: {'alpha': 0.1}
Best Score: 0.5779

Training Ridge Classifier...
Best Parameters: {'alpha': 1}
Best Score: 0.6305

Training Dummy Classifier...
Best Parameters: {}
Best Score: 0.2054



In [30]:
 # Ensemble members as (name, estimator) pairs. The alpha values are the best
 # parameters printed by the grid search above, copied in by hand.
 # NOTE(review): the dummy baseline (0.2054 in the tuning output) also gets a
 # vote here; confirm that is intended.
 estimators = [
     ('nb', MultinomialNB(alpha=0.1)),
     ('rc', RidgeClassifier(alpha=1)),
     ('dc', DummyClassifier()),
 ]

In [31]:
# Hard voting: the ensemble predicts by majority vote over the members' class labels.
voting_model = VotingClassifier(
    voting='hard',
    estimators=estimators,
)

In [32]:
# Fit all ensemble members on the training split.
voting_model.fit(X=x_train, y=y_train)

In [33]:
# Predict labels for the held-out split; scored against y_test in the next cell.
y_pred = voting_model.predict(X=x_test)

In [34]:
# Evaluate the voting classifier on the held-out split.
# (The model is a VotingClassifier, not a stacking ensemble, so the message
# is worded accordingly.)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of voting classifier: {accuracy:.4f}")

Accuracy of stacking classifier: 0.6048


In [35]:
# Predict labels for the vectorized test file. From here on y_pred holds these
# predictions, no longer the held-out-split predictions scored above.
y_pred = voting_model.predict(X=x_test_tfidf)

In [36]:

# Create a submission file
num_predictions_needed = 21000  # number of rows written to the submission

# Work on an object array: padding a fixed-width string array (e.g. dtype '<U1')
# would silently truncate the 'unknown' filler to the width of the labels.
y_pred_labels = np.asarray(y_pred, dtype=object)

num_missing = num_predictions_needed - len(y_pred_labels)
if num_missing < 0:
    # np.pad rejects a negative width with an opaque message; fail clearly instead.
    raise ValueError(
        f"Got {len(y_pred_labels)} predictions but the submission holds only "
        f"{num_predictions_needed} rows"
    )

# Fill any shortfall at the end with the placeholder label 'unknown'.
y_test_pred_padded = np.pad(
    y_pred_labels, (0, num_missing), 'constant', constant_values='unknown'
)
submission = pd.DataFrame({"ID": range(num_predictions_needed), "Label": y_test_pred_padded})

In [None]:

#print(test.shape)


In [37]:
# Sanity check: print the (rows, columns) shape of the submission frame.
print(submission.shape)

(21000, 2)


In [38]:
# Preview the first five rows of the submission.
submission.head(n=5)

Unnamed: 0,ID,Label
0,0,B
1,1,B
2,2,B
3,3,H
4,4,H


In [39]:
# Write the submission to disk without the DataFrame index column.
submission.to_csv(path_or_buf="submission3.csv", index=False)