In [29]:
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import RidgeClassifier
from sklearn.dummy import DummyClassifier
import numpy as np
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd

In [30]:
train = pd.read_csv("/content/C_TRAIN.csv")
test = pd.read_csv("/content/C_TEST.csv")

In [31]:
x = train['Content']
y = train['Label']
xx_test = test["Content"]

In [32]:
xx_test.isnull().sum()

0

In [33]:
xx_test.fillna('', inplace=True)

In [34]:
xx_test.isnull().sum()

0

In [35]:
tfidf_vectorizer = TfidfVectorizer()

In [36]:
x_tfidf = tfidf_vectorizer.fit_transform(x)
x_test_tfidf = tfidf_vectorizer.transform(xx_test)

In [37]:
nb_model = MultinomialNB()
rc_model= RidgeClassifier()
dc_model = DummyClassifier()

In [45]:
voting_model = VotingClassifier(
    estimators=[('nb', nb_model),('rc', rc_model), ('dc', dc_model)],
    voting='hard'  # Use can also use hard voting voting = 'hard'
)

In [46]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [47]:
x_train, x_test, y_train, y_test = train_test_split(x_tfidf,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42 ,
                                                    shuffle = True)

In [48]:
# Perform k-fold cross-validation
accuracy_scores = []

for train_index, test_index in kf.split(x_train):
    x_train_fold, x_val_fold = x_train[train_index], x_train[test_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[test_index]

    voting_model.fit(x_train_fold, y_train_fold)

    y_pred_voting = voting_model.predict(x_val_fold)

    accuracy_fold = accuracy_score(y_val_fold, y_pred_voting)
    accuracy_scores.append(accuracy_fold)

In [49]:
mean_accuracy = sum(accuracy_scores) / len(accuracy_scores)
print("Mean Accuracy of Voting Classifier with 5-fold cross-validation:", mean_accuracy)

Mean Accuracy of Voting Classifier with 5-fold cross-validation: 0.5513501549358123


In [52]:
y_pred_voting = voting_model.predict(x_test)

In [53]:
accuracy_voting = accuracy_score(y_test, y_pred_voting)
print("Accuracy of Voting Classifier with 5-fold cross-validation:", accuracy_voting)


Accuracy of Voting Classifier with 5-fold cross-validation: 0.5591854803010181


In [54]:
y_pred_voting = voting_model.predict(x_test_tfidf)

In [55]:

# Create a submission file
num_predictions_needed = 21000
y_test_pred_padded = np.pad(y_pred_voting, (0, num_predictions_needed - len(y_pred_voting)), 'constant', constant_values='unknown')
submission = pd.DataFrame({"ID": range(num_predictions_needed), "Label": y_test_pred_padded})

In [56]:

#print(test.shape)


In [57]:
print(submission.shape)

(21000, 2)


In [58]:
submission.head()

Unnamed: 0,ID,Label
0,0,B
1,1,B
2,2,B
3,3,H
4,4,H


In [59]:
submission.to_csv("submission2.csv", index=False)