In [2]:
%run '/kaggle/input/preparedata/prepare-ipynb.ipynb'

In [3]:
# # Import necessary libraries
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score
# from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import GridSearchCV
# from sklearn.ensemble import RandomForestClassifier  # Replace with your model

# def fit_model(train_data):
#     X_train = train_data['text']
#     y_train = train_data['spam']
#     # Fit a model on the train data
#     model = RandomForestClassifier()  # Replace with your model
#     model.fit(X_train, y_train)
#     return model

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV


In [6]:
def score_model(model, data):
    # Score a model on given data
    predictions = model.predict(data['text'])
    score = accuracy_score(data['spam'], predictions)
    return score

In [7]:
def evaluate_model(model, train_data, validation_data):
    # Evaluate the model predictions on train and validation data
    train_score = score_model(model, train_data)
    validation_score = score_model(model, validation_data)

    print(f"Train Accuracy: {train_score}")
    print(f"Validation Accuracy: {validation_score}")

## Model 1: Random Forest Classifier

In [5]:

def fit_model(train_data):
    # Split the data into training and validation sets
    X_train, X_valid, y_train, y_valid = train_test_split(
        train_data['text'], train_data['spam'], test_size=0.2, random_state=42
    )
    
    # Define the pipeline
    model = Pipeline([
        ('tfidf', TfidfVectorizer()),  # Text processing using TF-IDF
        ('classifier', RandomForestClassifier(random_state=42))  # Random Forest Classifier
    ])
    
    # Fit the model on the training data
    model.fit(X_train, y_train)
    return model

In [20]:
def fine_tune_model(model, train_data, validation_data):
    # Fine-tuning the model using GridSearchCV 
    # GridSearchCV for RandomForestClassifier

    param_grid = {'classifier__n_estimators': [50, 100, 200],
                  'classifier__max_depth': [None, 10, 20, 30]}
    grid_search = GridSearchCV(model, param_grid, cv=3)
    grid_search.fit(train_data['text'], train_data['spam'])
    tuned_model = grid_search.best_estimator_

    # Evaluate the tuned model
    evaluate_model(tuned_model, train_data, validation_data)
    return tuned_model

In [11]:
train_data = pd.read_csv('train.csv')
validation_data = pd.read_csv('validation.csv')
test_data = pd.read_csv('test.csv')

# Fit the model on train data
model = fit_model(train_data)

# Score and evaluate on train and validation data
evaluate_model(model, train_data, validation_data)

# Fine-tune the model if necessary
print("Fine tuned model accuracy is:")
tuned_model=fine_tune_model(model, train_data, validation_data)

Train Accuracy: 0.994543867306853
Validation Accuracy: 0.9738219895287958
Fine tuned model accuracy is:
Train Accuracy: 1.0
Validation Accuracy: 0.9720767888307156


## Model 2: Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegression

def fit_model2(train_data):
    # Split the data into training and validation sets
    X_train, X_valid, y_train, y_valid = train_test_split(
        train_data['text'], train_data['spam'], test_size=0.2, random_state=42
    )
    
    # Define the pipeline with Logistic Regression Classifier
    model = Pipeline([
        ('tfidf', TfidfVectorizer()),  # Text processing using TF-IDF
        ('classifier', LogisticRegression(random_state=42))  # Logistic Regression Classifier
    ])
    
    # Fit the model on the training data
    model.fit(X_train, y_train)
    return model


In [27]:
def fine_tune_model2(model, train_data, validation_data):
    # Fine-tune the model using GridSearchCV or any other method if necessary
    # Example: GridSearchCV for Logistic Regression
    param_grid = {
        'classifier__penalty': ['l2'],
        'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
        'classifier__solver': ['lbfgs']
    }
    grid_search = GridSearchCV(model, param_grid, cv=3)
    grid_search.fit(train_data['text'], train_data['spam'])
    tuned_model = grid_search.best_estimator_

    # Evaluate the tuned model
    evaluate_model(tuned_model, train_data, validation_data)
    return tuned_model



In [28]:
# Fit the model on train data
model2 = fit_model2(train_data)

# Score and evaluate on train and validation data
evaluate_model(model2, train_data, validation_data)

# Fine-tune the model if necessary
print("Fine tuned model accuracy is:")
tuned_model=fine_tune_model2(model2, train_data, validation_data)

Train Accuracy: 0.9903972064600611
Validation Accuracy: 0.9703315881326352
Fine tuned model accuracy is:
Train Accuracy: 1.0
Validation Accuracy: 0.9930191972076788


## Model 3: SVM

In [16]:
from sklearn.svm import SVC

def fit_model3(train_data):
    # Split the data into training and validation sets
    X_train, X_valid, y_train, y_valid = train_test_split(
        train_data['text'], train_data['spam'], test_size=0.2, random_state=42
    )
    
    # Define the pipeline with SVM Classifier
    model = Pipeline([
        ('tfidf', TfidfVectorizer()),  # Text processing using TF-IDF
        ('classifier', SVC(random_state=42))  # Support Vector Machine (SVM) Classifier
    ])
    
    # Fit the model on the training data
    model.fit(X_train, y_train)
    return model


In [23]:
def fine_tune_model3(model, train_data, validation_data):
    # Fine-tune the model using GridSearchCV or any other method if necessary
    # Example: GridSearchCV for SVM
    param_grid = {'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
                  'classifier__kernel': ['linear', 'rbf']}
    grid_search = GridSearchCV(model, param_grid, cv=3)
    grid_search.fit(train_data['text'], train_data['spam'])
    tuned_model = grid_search.best_estimator_

    # Evaluate the tuned model
    evaluate_model(tuned_model, train_data, validation_data)
    return tuned_model


In [24]:
# Fit the model on train data
model3 = fit_model3(train_data)

# Score and evaluate on train and validation data
evaluate_model(model3, train_data, validation_data)

# Fine-tune the model if necessary
print("Fine tuned model accuracy is:")
tuned_model=fine_tune_model3(model3, train_data, validation_data)

Train Accuracy: 0.9969445656918376
Validation Accuracy: 0.9895287958115183
Fine tuned model accuracy is:
Train Accuracy: 1.0
Validation Accuracy: 0.9947643979057592


In [31]:
#scoring the 3 benchmark models on test data
s1=score_model(model, test_data)
s2=score_model(model2, test_data)
s3=score_model(model3, test_data)
print("random forest classifier score:",s1)
print("logistic regression classifier score:",s2)
print("SVM score:",s3)

random forest classifier score: 0.9703315881326352
logistic regression classifier score: 0.9685863874345549
SVM score: 0.987783595113438


After evaluating the three models, it's evident that each performs pretty well for the given problem. The SVM model outperforms both the random forest classifier and logistic regression in terms of test score.While fine-tuning the models does not seem necessary, it results in 100% training accuracy for all three models (as opposed to 99% without it). The validation accuracy, after fine-tuning, experiences a marginal 1-2% change at best.