# **Task 1**

### **Notations, Data Initialization** 
**n** →number of features

**m** →number of training examples

**X** →input data matrix of shape (m x n)

**y** →true/target value (can be 0 or 1 only)

**x(i), y(i)** →ith training example

**w** → weights (parameters) of shape (n x 1)

**b** → bias (parameter), a real number that is broadcast across all training examples.

**y_hat** (ŷ, i.e. y with a hat) → hypothesis: the model's output, a value between 0 and 1

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# Standardizer used below to produce X_scaled
scaler = StandardScaler()

# Features: train.csv already converted to TF-IDF columns; the file also
# carries 'id' and 'label', which are not features and are dropped here.
X_df = pd.read_csv('train_tfidf_features.csv').drop(columns=['id', 'label'])

# Labels are read from the raw training file.
# NOTE(review): assumes train.csv rows are in the same order as the feature file - confirm.
y_df = pd.read_csv('train.csv')['label']

X = X_df.values
X_scaled = scaler.fit_transform(X)  # standardized copy of X

y2 = y_df.values          # labels as a 1-D array
y = y2.reshape(-1, 1)     # labels as a column vector (m x 1)


ModuleNotFoundError: No module named 'pandas'

In [None]:
# Sanity check: show the shape of the standardized feature matrix and of the 1-D label array
X_scaled.shape, y2.shape

## **Training Functions for Logistic Regression**

In [None]:
import numpy as np

class LogisticRegression:
    """Binary logistic regression fitted with mini-batch gradient descent.

    The model predicts ``sigmoid(X @ weights + bias)``. An optional L1, L2 or
    elastic-net penalty on the weights (never on the bias) is added both to
    the reported loss and to the weight gradient.

    Args:
        learning_rate: Step size of every gradient update.
        num_iterations: Maximum number of passes (epochs) over the data.
        reg_type: ``'l1'``, ``'l2'`` or ``'elastic_net'``; any other value
            disables regularization.
        lambda_param: Regularization strength.
        l1_ratio: Share of the L1 term in the elastic-net penalty.
    """

    def __init__(self, learning_rate=0.01, num_iterations=500, reg_type='l2', lambda_param=0.1, l1_ratio=0.5):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.reg_type = reg_type
        self.lambda_param = lambda_param
        self.l1_ratio = l1_ratio
        self.weights = None  # set by initialize_parameters(): shape (n_features, 1)
        self.bias = None     # set by initialize_parameters(): scalar

    def sigmoid(self, z):
        """Return the logistic function of ``z`` element-wise."""
        # exp() overflows float64 a little above 709; clipping keeps the result
        # finite and warning-free while changing it by less than 1e-200.
        z = np.clip(z, -500, 500)
        return 1 / (1 + np.exp(-z))

    def loss(self, y, y_predicted):
        """Return the mean binary cross-entropy between ``y`` and ``y_predicted``."""
        # The 1e-15 offset guards against log(0) when a prediction saturates.
        return -np.mean(y * np.log(y_predicted + 1e-15) + (1 - y) * np.log(1 - y_predicted + 1e-15))

    def gradients(self, X, y, y_predicted):
        """Return ``(dw, db)``, the gradient of the unregularized loss.

        The penalty term is deliberately not included here; ``train`` adds
        ``regularization_gradient()`` to ``dw`` itself.
        """
        num_samples = X.shape[0]
        residual = y_predicted - y
        dw = (1 / num_samples) * np.dot(X.T, residual)
        db = (1 / num_samples) * np.sum(residual)
        return dw, db

    def relu(self, z):
        """Return ``max(0, z)`` element-wise (not used by the training loop)."""
        return np.maximum(0, z)

    def initialize_parameters(self, num_features):
        """Reset the weights to a zero column vector and the bias to zero."""
        self.weights = np.zeros((num_features, 1))
        self.bias = 0

    def regularization(self):
        """Return the penalty value for the current weights."""
        if self.reg_type == 'l1':  # L1 regularization
            return self.lambda_param * np.sum(np.abs(self.weights))
        elif self.reg_type == 'l2':  # L2 regularization
            return 0.5 * self.lambda_param * np.sum(self.weights**2)
        elif self.reg_type == 'elastic_net':  # Elastic Net regularization
            l1_term = self.l1_ratio * np.sum(np.abs(self.weights))
            l2_term = (1 - self.l1_ratio) * 0.5 * np.sum(self.weights**2)
            return self.lambda_param * (l1_term + l2_term)
        else:
            return 0

    def regularization_gradient(self):
        """Return the (sub)gradient of ``regularization()`` w.r.t. the weights."""
        if self.reg_type == 'l1':
            return self.lambda_param * np.sign(self.weights)
        elif self.reg_type == 'l2':
            return self.lambda_param * self.weights
        elif self.reg_type == 'elastic_net':
            l1_grad = self.l1_ratio * np.sign(self.weights)
            l2_grad = (1 - self.l1_ratio) * self.weights
            return self.lambda_param * (l1_grad + l2_grad)
        else:
            return np.zeros_like(self.weights)

    def train(self, X, y, tolerance=1e-4, max_unchanged_iterations=5, batch_size=32):
        """Fit the weights and bias with shuffled mini-batch gradient descent.

        Args:
            X: Feature matrix of shape (m, n).
            y: Labels (0 or 1), either of shape (m,) or (m, 1).
            tolerance: An epoch counts as "unchanged" when the average loss
                moves by less than this amount.
            max_unchanged_iterations: Stop after this many consecutive
                unchanged epochs.
            batch_size: Number of rows per gradient step.

        Raises:
            ValueError: If ``X`` is empty, ``X`` and ``y`` disagree on the
                number of rows, or ``batch_size`` is not positive.
        """
        X = np.asarray(X)
        # A 1-D y would broadcast (m, 1) - (m,) into an (m, m) matrix, so
        # labels are always handled as a column vector.
        y = np.asarray(y).reshape(-1, 1)

        num_samples, num_features = X.shape
        if num_samples == 0:
            raise ValueError("X must contain at least one sample")
        if y.shape[0] != num_samples:
            raise ValueError("X and y must have the same number of rows")
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        self.initialize_parameters(num_features)

        prev_loss = float('inf')
        unchanged_iterations = 0

        for iteration in range(self.num_iterations):
            total_loss = 0.0
            num_batches = 0

            # Shuffle the data
            indices = np.random.permutation(num_samples)
            X_shuffled = X[indices]
            y_shuffled = y[indices]

            for i in range(0, num_samples, batch_size):
                X_batch = X_shuffled[i:i + batch_size]
                y_batch = y_shuffled[i:i + batch_size]

                # Forward pass
                linear_model = np.dot(X_batch, self.weights) + self.bias
                y_predicted = self.sigmoid(linear_model)

                # Compute loss (data term + penalty)
                total_loss += self.loss(y_batch, y_predicted) + self.regularization()
                num_batches += 1

                # Compute gradients; the penalty gradient must be added here,
                # otherwise lambda_param has no influence on the fitted weights.
                dw, db = self.gradients(X_batch, y_batch, y_predicted)
                dw = dw + self.regularization_gradient()

                # Update parameters
                self.weights -= self.learning_rate * dw
                self.bias -= self.learning_rate * db

            # Average over the batches actually processed (the last batch may
            # be partial, so num_samples / batch_size is not the batch count).
            avg_loss = total_loss / num_batches

            # Check for convergence
            if abs(prev_loss - avg_loss) < tolerance:
                unchanged_iterations += 1
                if unchanged_iterations >= max_unchanged_iterations:
                    print(f"Converged after {iteration + 1} iterations.")
                    break
            else:
                unchanged_iterations = 0

            prev_loss = avg_loss

            if (iteration + 1) % 100 == 0:
                print(f"Iteration {iteration + 1}/{self.num_iterations}, Loss: {avg_loss}")
        else:
            # Runs only when the loop finished without an early-stop break;
            # also safe when num_iterations is 0.
            print(f"Reached maximum iterations ({self.num_iterations}).")

    def predict_prob(self, X):
        """Return the predicted probability of class 1 for each row, shape (m, 1)."""
        linear_model = np.dot(X, self.weights) + self.bias
        return self.sigmoid(linear_model)

    def predict(self, X, threshold=0.5):
        """Return 0/1 predictions, shape (m, 1), using ``threshold`` on the probability."""
        return (self.predict_prob(X) >= threshold).astype(int)



In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score


# Step 2: Hold out 20% of the training data as a validation set (fixed seed for repeatability)
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")

# Step 3: The custom LogisticRegression class used below is the one defined
# earlier in this notebook; that cell must be run first.

# Step 4: Define a function to train and evaluate the model
def train_and_evaluate(X_train, y_train, X_val, y_val, **kwargs):
    """Fit a custom LogisticRegression and score it on the validation split.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_val: Validation feature matrix.
        y_val: Validation labels.
        **kwargs: Forwarded unchanged to the ``LogisticRegression`` constructor.

    Returns:
        A ``(accuracy, model)`` tuple: the validation accuracy and the fitted model.
    """
    model = LogisticRegression(**kwargs)
    model.train(X_train, y_train)
    # Only hard predictions are needed for accuracy, so the probabilities are
    # not computed separately.
    y_pred = model.predict(X_val)
    return accuracy_score(y_val, y_pred), model

# Step 5: Hyperparameter tuning
num_iterations = 1000  # Epoch budget passed to every candidate model
regularization_types = ['l2'] #, 'elastic_net', 'l1'
lambda_params = [0.0001, 0.001, 0.01, 0.1, 1, 10]  # Regularization parameter
learning_rates = [1, 0.1, 0.01, 0.001, 0.0001] # Learning rate
l1_ratios = [0.2, 0.5, 0.8]  # Only for elastic net

best_accuracy = 0
best_params = {}
for learning_rate in learning_rates:
    for reg_type in regularization_types:
        # l1_ratio is only meaningful for elastic net; every other penalty
        # is evaluated once per (learning_rate, lambda_param) pair.
        ratio_candidates = l1_ratios if reg_type == 'elastic_net' else [None]
        for lambda_param in lambda_params:
            for l1_ratio in ratio_candidates:
                params = {
                    'learning_rate': learning_rate,
                    # Previously declared but never forwarded, so every model
                    # silently ran with the constructor default instead.
                    'num_iterations': num_iterations,
                    'reg_type': reg_type,
                    'lambda_param': lambda_param,
                }
                if l1_ratio is not None:
                    params['l1_ratio'] = l1_ratio

                accuracy, model = train_and_evaluate(
                    X_train, y_train, X_val, y_val, **params
                )

                summary = f"Accuracy: {accuracy}, Params: lr-{learning_rate}, rt-{reg_type}, lp-{lambda_param}"
                if l1_ratio is not None:
                    summary += f", l1-{l1_ratio}"
                print(summary)

                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    # Stored in constructor-keyword form so it can be passed
                    # straight to LogisticRegression(**best_params).
                    best_params = params

print(f"Best validation accuracy: {best_accuracy}")
print(f"Best parameters: {best_params}")

# Step 6: Refit with the best parameters on training + validation rows combined
X_full = np.vstack((X_train, X_val))
y_full = np.concatenate((y_train, y_val))
final_model = LogisticRegression(**best_params)
final_model.train(X_full, y_full)


In [None]:
# Refit the final model on training + validation rows combined
X_full = np.vstack((X_train, X_val))
y_full = np.concatenate((y_train, y_val))
final_model = LogisticRegression(**best_params)
final_model.train(X_full, y_full)

# Step 7: Predict on the test set
# Test features are read from test_tfidf_features.csv; its 'id' column is not a feature
X_test_df = pd.read_csv('test_tfidf_features.csv').drop(columns=['id'])
X_test = X_test_df.values
test_predictions = final_model.predict(X_test)

In [None]:
# Show the hyperparameters selected by the search
print(best_params)

In [None]:
from sklearn.linear_model import LogisticRegression as SklearnLogisticRegression

# Step 7: Train and predict using sklearn's LogisticRegression

# Fit sklearn on the same rows the custom final_model was trained on
# (training + validation), otherwise the two models are not comparable.
X_fit = np.vstack((X_train, X_val))
y_fit = np.concatenate((y_train, y_val)).reshape(-1)

# The custom model minimises  mean(loss) + lambda * penalty,  while sklearn
# minimises  penalty + C * sum(loss).  Dividing the first by lambda shows the
# equivalent setting is C = 1 / (lambda * n_samples), not 1 / lambda.
sklearn_C = 1 / (best_params['lambda_param'] * X_fit.shape[0])

if best_params['reg_type'] == 'l1':
    sklearn_model = SklearnLogisticRegression(penalty='l1', C=sklearn_C, solver='liblinear')
elif best_params['reg_type'] == 'l2':
    sklearn_model = SklearnLogisticRegression(penalty='l2', C=sklearn_C)
else:  # elastic_net
    sklearn_model = SklearnLogisticRegression(penalty='elasticnet', C=sklearn_C,
                                              l1_ratio=best_params['l1_ratio'], solver='saga')

sklearn_model.fit(X_fit, y_fit)
sklearn_test_predictions = sklearn_model.predict(X_test)
# The custom model returns a column vector; flatten it so the element-wise
# comparison below does not broadcast into a matrix.
test_predictions = test_predictions.reshape(-1)
print(test_predictions.shape, sklearn_test_predictions.shape)

# Step 8: Compare predictions

agreement = np.mean(test_predictions == sklearn_test_predictions)
print(f"\nAgreement between custom model and sklearn model on test set: {agreement:.4f}")


In [None]:
# Step 9: Report how both models fit the training split
train_predictions = final_model.predict(X_train)
sklearn_train_predictions = sklearn_model.predict(X_train)

train_accuracy = accuracy_score(y_train, train_predictions)
sklearn_accuracy = accuracy_score(y_train, sklearn_train_predictions)

print(f"\nTraining Accuracy: {train_accuracy:.4f}")
print(f"Training Accuracy (Sklearn): {sklearn_accuracy:.4f}")

# Per-class metrics and confusion matrix for the custom model only
print("\nClassification Report (Training Set):")
print(classification_report(y_train, train_predictions))
print("\nConfusion Matrix (Training Set):")
print(confusion_matrix(y_train, train_predictions))

In [None]:
# Step 10: Write the custom model's test predictions to a CSV file
test_predictions_df = pd.DataFrame(test_predictions, columns=['label'])
# Place the 'id' column from test.csv in front of the predicted label
test_predictions_df.insert(0, 'id', pd.read_csv('test.csv')['id'])
test_predictions_df.to_csv('t1g3_test_predictions.csv', index=False)
print("\nTest predictions saved to 't1g3_test_predictions.csv'")

In [None]:
lambda_params = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]  # Updated lambda parameters

# Both models are fitted on the training split only, so the accuracy printed
# for X_val below is measured on rows neither model has seen.
y_train_flat = np.asarray(y_train).reshape(-1)
n_train = X_train.shape[0]

for lamb in lambda_params:
    alt_model = LogisticRegression(learning_rate=0.1, num_iterations=1000, reg_type='l2', lambda_param=lamb)
    alt_model.train(X_train, y_train)
    # predict() returns a column vector; flatten it so the comparison with
    # sklearn's 1-D output is element-wise instead of an (m x m) broadcast.
    alt_test_predictions = alt_model.predict(X_test).reshape(-1)

    # Custom objective: mean(loss) + lambda * penalty; sklearn objective:
    # penalty + C * sum(loss). The matching value is C = 1 / (lambda * n_samples).
    sklearn_model = SklearnLogisticRegression(penalty='l2', C=1 / (lamb * n_train))
    sklearn_model.fit(X_train, y_train_flat)
    sklearn_test_predictions = sklearn_model.predict(X_test)

    agreement = np.mean(alt_test_predictions == sklearn_test_predictions)
    print(f"\nAgreement between custom model and sklearn model on test set (lambda={lamb}): {agreement:.4f}")
    print(f"Accuracy: {accuracy_score(y_val, alt_model.predict(X_val)):.4f}")
    print(f"Accuracy (Sklearn): {accuracy_score(y_val, sklearn_model.predict(X_val)):.4f}")
    

# Task 2

### with 2000 components

In [None]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Read the TF-IDF feature tables
# NOTE(review): absolute, machine-specific paths - these only resolve on the author's machine.
df_train = pd.read_csv("C:\\Users\\user\\Downloads\\train_tfidf_features.csv.zip")
df_test = pd.read_csv("C:\\Users\\user\\Downloads\\test_tfidf_features.csv (1).zip")

# Target column and feature columns ('id' is not a feature)
y_train = df_train['label']
X_train = df_train.drop(columns=['id', 'label'])
X_test = df_test.drop(columns=['id'])

# Standardize with statistics learned from the training rows only
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Number of principal components kept in this run
n_components = 2000  # Change this to the desired number of components

# Project both splits onto the components fitted on the training rows
pca = PCA(n_components=n_components, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# 2-nearest-neighbour classifier in the reduced space
knn = KNeighborsClassifier(n_neighbors=2).fit(X_train_pca, y_train)

# Predict on test set
y_pred = knn.predict(X_test_pca)

# Submission table: one row per test id with its predicted label
submission_df = pd.DataFrame({'id': df_test['id'], 'label': y_pred})

# NOTE(review): every component-count cell in this notebook writes this same
# file name, so each run overwrites the previous one's predictions.
output_path = "knn_pca_predictions.csv"
submission_df.to_csv(output_path, index=False)

print("Predictions saved to knn_pca_predictions.csv")


### with 1000 components

In [None]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Read the TF-IDF feature tables
# NOTE(review): absolute, machine-specific paths - these only resolve on the author's machine.
df_train = pd.read_csv("C:\\Users\\user\\Downloads\\train_tfidf_features.csv.zip")
df_test = pd.read_csv("C:\\Users\\user\\Downloads\\test_tfidf_features.csv (1).zip")

# Target column and feature columns ('id' is not a feature)
y_train = df_train['label']
X_train = df_train.drop(columns=['id', 'label'])
X_test = df_test.drop(columns=['id'])

# Standardize with statistics learned from the training rows only
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Number of principal components kept in this run
n_components = 1000  # Change this to the desired number of components

# Project both splits onto the components fitted on the training rows
pca = PCA(n_components=n_components, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# 2-nearest-neighbour classifier in the reduced space
knn = KNeighborsClassifier(n_neighbors=2).fit(X_train_pca, y_train)

# Predict on test set
y_pred = knn.predict(X_test_pca)

# Submission table: one row per test id with its predicted label
submission_df = pd.DataFrame({'id': df_test['id'], 'label': y_pred})

# NOTE(review): every component-count cell in this notebook writes this same
# file name, so each run overwrites the previous one's predictions.
output_path = "knn_pca_predictions.csv"
submission_df.to_csv(output_path, index=False)

print("Predictions saved to knn_pca_predictions.csv")


### with 500 components

In [None]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Read the TF-IDF feature tables
# NOTE(review): absolute, machine-specific paths - these only resolve on the author's machine.
df_train = pd.read_csv("C:\\Users\\user\\Downloads\\train_tfidf_features.csv.zip")
df_test = pd.read_csv("C:\\Users\\user\\Downloads\\test_tfidf_features.csv (1).zip")

# Target column and feature columns ('id' is not a feature)
y_train = df_train['label']
X_train = df_train.drop(columns=['id', 'label'])
X_test = df_test.drop(columns=['id'])

# Standardize with statistics learned from the training rows only
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Number of principal components kept in this run
n_components = 500  # Change this to the desired number of components

# Project both splits onto the components fitted on the training rows
pca = PCA(n_components=n_components, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# 2-nearest-neighbour classifier in the reduced space
knn = KNeighborsClassifier(n_neighbors=2).fit(X_train_pca, y_train)

# Predict on test set
y_pred = knn.predict(X_test_pca)

# Submission table: one row per test id with its predicted label
submission_df = pd.DataFrame({'id': df_test['id'], 'label': y_pred})

# NOTE(review): every component-count cell in this notebook writes this same
# file name, so each run overwrites the previous one's predictions.
output_path = "knn_pca_predictions.csv"
submission_df.to_csv(output_path, index=False)

print("Predictions saved to knn_pca_predictions.csv")


### with 100 components

In [None]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Read the TF-IDF feature tables
# NOTE(review): absolute, machine-specific paths - these only resolve on the author's machine.
df_train = pd.read_csv("C:\\Users\\user\\Downloads\\train_tfidf_features.csv.zip")
df_test = pd.read_csv("C:\\Users\\user\\Downloads\\test_tfidf_features.csv (1).zip")

# Target column and feature columns ('id' is not a feature)
y_train = df_train['label']
X_train = df_train.drop(columns=['id', 'label'])
X_test = df_test.drop(columns=['id'])

# Standardize with statistics learned from the training rows only
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Number of principal components kept in this run
n_components = 100  # Change this to the desired number of components

# Project both splits onto the components fitted on the training rows
pca = PCA(n_components=n_components, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# 2-nearest-neighbour classifier in the reduced space
knn = KNeighborsClassifier(n_neighbors=2).fit(X_train_pca, y_train)

# Predict on test set
y_pred = knn.predict(X_test_pca)

# Submission table: one row per test id with its predicted label
submission_df = pd.DataFrame({'id': df_test['id'], 'label': y_pred})

# NOTE(review): every component-count cell in this notebook writes this same
# file name, so each run overwrites the previous one's predictions.
output_path = "knn_pca_predictions.csv"
submission_df.to_csv(output_path, index=False)

print("Predictions saved to knn_pca_predictions.csv")



# Task 3


### Naive Bayes

In [None]:

#Task 3 Naive Bayes

#load dataset, separate into training and test datasets, features and labels
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Read the TF-IDF feature tables
df_train = pd.read_csv("C:\\Users\\user\\Downloads\\train_tfidf_features.csv.zip")
df_test = pd.read_csv("C:\\Users\\user\\Downloads\\test_tfidf_features.csv (1).zip")

# Target column and feature columns ('id' is not a feature)
y_train = df_train['label']
X_train = df_train.drop(columns=['id', 'label'])
X_test = df_test.drop(columns=['id'])

# Fit a multinomial Naive Bayes model with its default settings
model = MultinomialNB().fit(X_train, y_train)

# Predict the test labels, show them, and write the submission file
y_pred = model.predict(X_test)
print(y_pred)
submission = pd.DataFrame({'id': df_test['id'], 'label': y_pred})
submission.to_csv('task3_version1_naivebayes.csv', index=False)


### Adaboost Classifier with hyperparameter tuning

In [None]:
#Task 3 Adaboost Classifier with hyperparameter tuning

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV

# Read the TF-IDF feature tables
df_train = pd.read_csv("C:\\Users\\user\\Downloads\\train_tfidf_features.csv.zip")
df_test = pd.read_csv("C:\\Users\\user\\Downloads\\test_tfidf_features.csv (1).zip")

# Target column and feature columns ('id' is not a feature)
y_train = df_train['label']
X_train = df_train.drop(columns=['id', 'label'])
X_test = df_test.drop(columns=['id'])

# Hold out 30% of the labelled rows for a final validation check
X_train_set, X_validate_set, y_train_set, y_validate_set = train_test_split(
    X_train, y_train, test_size=0.3, random_state=42
)

print(f"Training set shape: {X_train_set.shape}")
print(f"Validation set shape: {X_validate_set.shape}")

# Candidate values searched exhaustively by GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200, 500, 1000],
    'learning_rate': [0.01, 0.1, 0.5, 1],
}

# 5-fold cross-validated grid search, scored by accuracy, on the training part only
grid_search = GridSearchCV(
    estimator=AdaBoostClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(X_train_set, y_train_set)

# Winning configuration and its mean cross-validation score
best_params, best_score = grid_search.best_params_, grid_search.best_score_

print(f"Best parameters: {best_params}")
print(f"Best cross-validation accuracy: {best_score}")

# Score the refitted best estimator on the held-out validation rows
best_model = grid_search.best_estimator_
validate_accuracy = best_model.score(X_validate_set, y_validate_set)

print(f"Validation Accuracy with best model: {validate_accuracy}")

# Predict the test labels and write the submission file
y_pred = best_model.predict(X_test)
submission = pd.DataFrame({'id': df_test['id'], 'label': y_pred})
submission.to_csv('task3_adaboostclassifier.csv', index=False)

print("Submission file created: task3_adaboostclassifier.csv")

### Ensemble Model with Voting Classifier 
### (Logistic Regression, XGBClassifier, RandomForestClassifier) 

In [None]:
### Task 3 Ensemble Model with Voting Classifier (Logistic Regression, XGBClassifier, RandomForestClassifier) 

from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# Loading train and test data sets in dataframe from CSV files
df_train = pd.read_csv("train_tfidf_features.csv")
df_test = pd.read_csv("test_tfidf_features.csv")

X_train = df_train.drop(columns=['id', 'label'])
# Binarize the label column at 0.5 so the classifiers receive integer 0/1 targets
y_train = (df_train['label'] >= 0.5).astype(int)
X_test = df_test.drop(columns=['id'])

# Define parameter grids for each model
param_grid_lr = {'C': [0.1, 1, 10]}
param_grid_xgb = {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1]}
param_grid_rf = {'n_estimators': [50, 100, 200]}

# Initialize grid search with stratified k-fold cross-validation
cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

# NOTE(review): each search optimises a different metric (log loss, ROC AUC,
# accuracy) - confirm this is intentional.
grid_search_lr = GridSearchCV(LogisticRegression(solver='liblinear'), param_grid_lr, cv=cv, scoring='neg_log_loss', verbose=4)
grid_search_xgb = GridSearchCV(XGBClassifier(), param_grid_xgb, cv=cv, scoring='roc_auc', verbose=4)
grid_search_rf = GridSearchCV(RandomForestClassifier(), param_grid_rf, cv=cv, scoring='accuracy', verbose=4)

# Fit grid search
grid_search_lr.fit(X_train, y_train)
grid_search_xgb.fit(X_train, y_train)
grid_search_rf.fit(X_train, y_train)

# Get the best models
best_lr = grid_search_lr.best_estimator_
best_xgb = grid_search_xgb.best_estimator_
best_rf = grid_search_rf.best_estimator_

# Print the best parameters for each model
print("Best parameters for Logistic Regression:", grid_search_lr.best_params_)
print("Best parameters for XGBoost:", grid_search_xgb.best_params_)
print("Best parameters for Random Forest:", grid_search_rf.best_params_)

# Create a voting classifier with the best models
final_model = VotingClassifier(
    estimators=[
        ('lr', best_lr),
        ('xgb', best_xgb),
        ('rf', best_rf)
    ],
    voting='soft',  # average the predicted class probabilities
    verbose=True
)

# Train the final voting classifier on the training set
final_model.fit(X_train, y_train)

# Print the accuracy of the model on the training set.
# This is an in-sample figure (same rows as used for fitting), so it is an
# optimistic sanity check rather than an estimate of test performance.
train_accuracy = final_model.score(X_train, y_train)
print(f"Training accuracy: {train_accuracy}")

# Predict on the test set
y_test_pred = final_model.predict(X_test)

# Generate the submission file
submission = pd.DataFrame({'id': df_test['id'], 'label': y_test_pred})
submission.to_csv('voting_classifier.csv', index=False)

print("Submission files created!")

### Ensemble Model with Logistic Regression, XGBClassifier, RandomForestClassifier, GradientBoostingClassifier
### (a Stacking Classifier was used as the final classifier)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from skopt import BayesSearchCV
import warnings

warnings.filterwarnings("ignore")

# RandomizedSearchCV is used below but was missing from this cell's imports
from sklearn.model_selection import RandomizedSearchCV

# Load datasets
df_train = pd.read_csv("C:\\Users\\user\\Downloads\\train_tfidf_features.csv.zip")
df_test = pd.read_csv("C:\\Users\\user\\Downloads\\test_tfidf_features.csv (1).zip")

# Prepare features and labels
X_train = df_train.drop(columns=['id', 'label'])
y_train = df_train['label']
X_test = df_test.drop(columns=['id'])

# Split the data into training and validation sets for hyperparameter tuning
X_train_set, X_validate_set, y_train_set, y_validate_set = train_test_split(
    X_train, y_train, test_size=0.3, random_state=42, stratify=y_train
)

print(f"Training set shape: {X_train_set.shape}")
print(f"Validation set shape: {X_validate_set.shape}")

# Define parameter grids for each model.
# Keys prefixed with '<step>__' address a step inside a Pipeline.
param_dist_lr = {'lr__C': [0.01, 0.1, 1], 'lr__solver': ['lbfgs'], 'lr__max_iter': [100, 200]}
param_dist_xgb = {
    'n_estimators': [50, 100],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 6],
    'reg_lambda': [1, 10],
    'reg_alpha': [0, 1]
}
param_dist_rf = {'rf__n_estimators': [50, 100], 'rf__max_depth': [10, 20], 'rf__min_samples_split': [2, 5]}
# Tuples here are passed to BayesSearchCV as search-space bounds
param_dist_gb = {
    'gb__n_estimators': (50, 100),
    'gb__learning_rate': (0.01, 0.1),
    'gb__max_depth': (3, 6)
}

# Initialize the models
model_lr = LogisticRegression()
model_xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model_rf = RandomForestClassifier()
model_gb = GradientBoostingClassifier()

# Create pipelines for model training
pipeline_lr = Pipeline([
    ('scaler', StandardScaler()),
    ('lr', model_lr)
])

pipeline_rf = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', model_rf)
])

pipeline_gb = Pipeline([
    ('scaler', StandardScaler()),
    ('gb', model_gb)
])

# Initialize BayesianSearchCV for the Gradient Boosting model
bayes_search_gb = BayesSearchCV(
    estimator=pipeline_gb,
    search_spaces=param_dist_gb,
    n_iter=8,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    scoring='accuracy',
    n_jobs=1,
    verbose=1
)

# Initialize RandomizedSearchCV for other models
random_search_lr = RandomizedSearchCV(
    estimator=pipeline_lr, param_distributions=param_dist_lr, n_iter=6,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    scoring='accuracy', n_jobs=1, verbose=1
)
random_search_xgb = RandomizedSearchCV(
    estimator=model_xgb, param_distributions=param_dist_xgb, n_iter=10,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    scoring='accuracy', n_jobs=1, verbose=1
)
random_search_rf = RandomizedSearchCV(
    estimator=pipeline_rf, param_distributions=param_dist_rf, n_iter=8,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    scoring='accuracy', n_jobs=1, verbose=1
)

# Fit RandomizedSearchCV
print("Training Logistic Regression model...")
random_search_lr.fit(X_train_set, y_train_set)

# XGBoost is scored with plain cross-validated accuracy like the other models.
# The earlier custom scorer was removed: it had the (estimator, X, y) signature
# but was wrapped in make_scorer, which calls its function as (y_true, y_pred),
# and it refitted the estimator on the very fold it was then scored on.
print("Training XGBoost model...")
random_search_xgb.fit(X_train_set, y_train_set)

print("Training Random Forest model...")
random_search_rf.fit(X_train_set, y_train_set)

print("Training Gradient Boosting model with Bayesian optimization...")
bayes_search_gb.fit(X_train_set, y_train_set)

# Get the best models
best_lr = random_search_lr.best_estimator_
best_xgb = random_search_xgb.best_estimator_
best_rf = random_search_rf.best_estimator_
best_gb = bayes_search_gb.best_estimator_

print(f"Best parameters for Logistic Regression: {random_search_lr.best_params_}")
print(f"Best parameters for XGBoost: {random_search_xgb.best_params_}")
print(f"Best parameters for Random Forest: {random_search_rf.best_params_}")
print(f"Best parameters for Gradient Boosting: {bayes_search_gb.best_params_}")

# Create the Voting Classifier with the best models
# (constructed for reference only; it is not fitted or used for the submission)
voting_clf = VotingClassifier(
    estimators=[('lr', best_lr), ('xgb', best_xgb), ('rf', best_rf), ('gb', best_gb)],
    voting='soft'  # Using soft voting to leverage probability estimates
)

# Stack the models using a meta-classifier
stacked_clf = StackingClassifier(
    estimators=[('lr', best_lr), ('xgb', best_xgb), ('rf', best_rf), ('gb', best_gb)],
    final_estimator=LogisticRegression(C=0.1, max_iter=100)  # Increased regularization for meta-classifier
)

# Evaluate first: fit on the training part only, so the validation score is
# measured on rows the stacker has not seen.
print("Training Stacking Classifier...")
stacked_clf.fit(X_train_set, y_train_set)

train_accuracy = stacked_clf.score(X_train_set, y_train_set)
val_accuracy = stacked_clf.score(X_validate_set, y_validate_set)

print(f"Stacking Classifier Training accuracy: {train_accuracy}")
print(f"Stacking Classifier Validation accuracy: {val_accuracy}")

# Then refit on the entire training data for the submission
stacked_clf.fit(X_train, y_train)

# Make predictions on the test set
y_test_pred = stacked_clf.predict(X_test)

# Create the submission file
submission = pd.DataFrame({'id': df_test['id'], 'label': y_test_pred})
submission.to_csv('stacking_classifier_submission_v3.csv', index=False)

print("Submission file created: stacking_classifier_submission_v3.csv")


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


<class 'ModuleNotFoundError'>: No module named 'skopt'