In [None]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC

In [None]:
# Read the dataset from its CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer='/content/780-testing-2.csv')

In [None]:
# Preview the first five rows to sanity-check the load
data.head(n=5)

Unnamed: 0,Target,fblk,nblk,anchor_exact_keyword,anchor_keyword,pa_score,ref_dom,outb_dom,mon_visits,keywords_ranked_on,...,h3kw,alt,altkw,linkin,linkout,urllen,urlkw,txtlen,txtkw,domain_age
0,High SEO,466,957,451,948,79,403,10,16000000.0,5000000,...,0,5,1,136,14,90,1,2175,55,23
1,High SEO,60000,10500,40702,44594,98,5300,123,6700000000.0,327000000,...,1,8,1,1723,642,43,1,7626,137,0
2,High SEO,1800,2400,634,1493,62,805,35,3900000.0,191000,...,1,23,9,828,74,57,1,2591,217,8
3,High SEO,968,1300,589,1300,85,466,1,33100000.0,5800000,...,0,3,0,62,5,45,1,1318,41,36
4,High SEO,3700,1000,254,761,70,492,6,24700000.0,2900000,...,7,48,4,139,49,54,1,2045,103,33


In [None]:
# Data preprocessing: helper that caps outliers (missing values are imputed in a later cell)
def handle_outliers(column):
    """Cap the values of a numeric Series at its 1.5 * IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where ``Q1`` and
    ``Q3`` are the 25th and 75th percentiles of ``column`` and
    ``IQR = Q3 - Q1``. Values outside the fences are replaced by the nearest
    fence; values inside them are left as they are.

    Parameters
    ----------
    column : pandas.Series
        Numeric values to cap. The input Series is not modified.

    Returns
    -------
    pandas.Series
        A new Series with the same index as ``column`` and every value limited
        to the ``[lower_bound, upper_bound]`` interval. Missing values stay
        missing.
    """
    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # Series.clip caps the whole column in one vectorised call instead of
    # invoking np.clip once per element through a Python-level lambda.
    return column.clip(lower=lower_bound, upper=upper_bound)

In [None]:

# Cap outliers in every int64/float64 column, then impute the columns that
# are listed as containing missing values with their column mean.
# NOTE(review): the fences and means are computed on the full dataset, i.e.
# before the train/validation split — confirm this is acceptable.
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns
for col in numerical_cols:
    data[col] = handle_outliers(data[col])

missing_value_cols = ['mon_visits', 'cwvm', 'cwvd']
for col in missing_value_cols:
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on data[col]: the in-place form works on an intermediate object, is
    # deprecated in pandas 2.x and leaves `data` unchanged under Copy-on-Write.
    data[col] = data[col].fillna(data[col].mean())

In [None]:
# Label Encoding for categorical columns
# Each column gets its own fitted encoder, stored in `label_encoders`, so the
# integer codes (in particular those of 'Target') can be mapped back to the
# original labels later with label_encoders[col].inverse_transform(...).
# Re-fitting a single shared encoder would keep only the last column's mapping.
categorical_cols = ['Target', 'mf', 'ssl', 'si']
label_encoders = {}
for col in categorical_cols:
    label_encoder = preprocessing.LabelEncoder()
    data[col] = label_encoder.fit_transform(data[col])
    label_encoders[col] = label_encoder

In [None]:
# Separate the label column (y) from the remaining feature columns (X)
y = data['Target']
X = data.drop(columns=['Target'])

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Model Selection and Training
# Candidate classifiers as (display name, estimator) pairs.
# The feature columns differ by many orders of magnitude (compare pa_score
# with mon_visits in the data preview), so the scale-sensitive estimators are
# wrapped in a pipeline that standardises the features first. The scaler is
# fitted only on the data passed to fit(), i.e. the training split.
models = [
    (
        'Logistic Regression',
        # max_iter is raised above the default so the solver has room to converge.
        make_pipeline(preprocessing.StandardScaler(), LogisticRegression(max_iter=1000)),
    ),
    (
        'Support Vector Classifier',
        make_pipeline(preprocessing.StandardScaler(), SVC()),
    ),
    # Seeded so the reported accuracy is the same on every run.
    ('Random Forest Classifier', RandomForestClassifier(random_state=42)),
]

In [None]:
# Fit each candidate on the training split and report its accuracy on the
# validation split.
for model_name, model in models:
    val_predictions = model.fit(X_train, y_train).predict(X_val)
    val_accuracy = accuracy_score(y_true=y_val, y_pred=val_predictions)

    print(
        f"Model: {model_name}",
        f"Validation Accuracy: {val_accuracy}",
        sep="\n",
    )

Model: Logistic Regression
Validation Accuracy: 0.4423076923076923
Model: Support Vector Classifier
Validation Accuracy: 0.5192307692307693
Model: Random Forest Classifier
Validation Accuracy: 0.5961538461538461


In [None]:

# Search space for tuning the RandomForestClassifier: every combination of
# the values below is a candidate configuration.
param_grid = dict(
    n_estimators=[50, 100, 200, 300, 400, 500],
    max_depth=[None, 10, 20, 30, 40, 50],
    min_samples_split=[2, 5, 10, 15],
    min_samples_leaf=[1, 2, 4, 8, 16, 32],
)

In [None]:
# Exhaustive 5-fold cross-validated search over param_grid, scored by
# accuracy and run on all available cores (n_jobs=-1).
# The forest is seeded so that the cross-validation scores, and therefore the
# selected parameters, are the same on every run.
rf_classifier = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf_classifier, param_grid, cv=5, n_jobs=-1, scoring='accuracy')
grid_search.fit(X_train, y_train)

In [None]:
# Take the best estimator and its parameters from the finished search, then
# measure that estimator's accuracy on the held-out validation split.
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
val_predictions = best_model.predict(X_val)
val_accuracy = accuracy_score(y_true=y_val, y_pred=val_predictions)



In [None]:
# Report the selected hyperparameters and the validation accuracy they reach
print(
    f"Best Model Parameters: {best_params}",
    f"Validation Accuracy with Best Model: {val_accuracy}",
    sep="\n",
)


Best Model Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 50}
Validation Accuracy with Best Model: 0.6474358974358975
