In [4]:
!pip install lightgbm


Defaulting to user installation because normal site-packages is not writeable


In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

# Clean the raw training set and write the result to TrainOnMe_Cleaned.csv.
train_df = pd.read_csv("TrainOnMe.csv")

# Drop the unnamed index column if the CSV carries one.
if "Unnamed: 0" in train_df.columns:
    train_df = train_df.drop(columns=["Unnamed: 0"])

# Placeholder strings that stand for "missing". "?" is included so that this
# cell treats the same markers as missing as the evaluation-set cleaning does.
train_df = train_df.replace({"?": np.nan, "F": np.nan, "Boom!": np.nan})

# A row without a target cannot be used for supervised training. Dropping it
# also keeps a literal "nan" string from turning into a spurious class label.
if "y" in train_df.columns:
    train_df = train_df.dropna(subset=["y"]).reset_index(drop=True)

# Convert every column that parses cleanly as numeric; leave the rest alone.
# (pd.to_numeric(errors="ignore") is deprecated, so catch the error instead.)
for col in train_df.columns:
    try:
        train_df[col] = pd.to_numeric(train_df[col])
    except (ValueError, TypeError):
        pass  # column holds non-numeric values; keep it unchanged

num_cols = train_df.select_dtypes(include=["number"]).columns
cat_cols = train_df.select_dtypes(include=["object"]).columns

num_imputer = SimpleImputer(strategy="mean")
cat_imputer = SimpleImputer(strategy="most_frequent")

if len(num_cols):
    train_df[num_cols] = num_imputer.fit_transform(train_df[num_cols])

if len(cat_cols):
    # Normalise the non-missing categorical values to str but keep NaN as NaN:
    # a plain astype(str) would turn NaN into the string "nan", which the
    # imputer would then treat as a regular category instead of filling it.
    cat_missing = train_df[cat_cols].isna()
    train_df[cat_cols] = train_df[cat_cols].astype(str).mask(cat_missing)
    train_df[cat_cols] = cat_imputer.fit_transform(train_df[cat_cols])

train_df.to_csv("TrainOnMe_Cleaned.csv", index=False)


  train_df[col] = pd.to_numeric(train_df[col], errors="ignore")


In [2]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

# Clean the raw evaluation set and write the result to EvaluateOnMe_Cleaned.csv.
eval_df = pd.read_csv("EvaluateOnMe.csv")

# Drop the unnamed index column if the CSV carries one.
if "Unnamed: 0" in eval_df.columns:
    eval_df = eval_df.drop(columns=["Unnamed: 0"])

# Placeholder strings that stand for "missing".
eval_df = eval_df.replace({"?": np.nan, "F": np.nan, "Boom!": np.nan})

# Convert every column that parses cleanly as numeric; leave the rest alone.
# (pd.to_numeric(errors="ignore") is deprecated, so catch the error instead.)
for col in eval_df.columns:
    try:
        eval_df[col] = pd.to_numeric(eval_df[col])
    except (ValueError, TypeError):
        pass  # column holds non-numeric values; keep it unchanged

num_cols = eval_df.select_dtypes(include=["number"]).columns
cat_cols = eval_df.select_dtypes(include=["object"]).columns

# NOTE(review): these imputers are fitted on the evaluation data itself, so the
# fill values come from this file rather than from the training set.
num_imputer = SimpleImputer(strategy="mean")
cat_imputer = SimpleImputer(strategy="most_frequent")

if len(num_cols):
    eval_df[num_cols] = num_imputer.fit_transform(eval_df[num_cols])
if len(cat_cols):
    eval_df[cat_cols] = cat_imputer.fit_transform(eval_df[cat_cols])


def cap_outliers(df, column):
    """Clip ``df[column]`` in place to the 1.5 * IQR fences.

    Values below ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR`` are replaced by
    the respective fence, where Q1/Q3 are the 25th/75th percentiles of the
    column itself.

    Args:
        df: DataFrame to modify; the column is overwritten.
        column: Name of a numeric column in ``df``.
    """
    Q1, Q3 = df[column].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    df[column] = np.clip(df[column], lower_bound, upper_bound)


# NOTE(review): capping is applied to the evaluation set only; the training-set
# cleaning cell in this notebook does not cap outliers.
for col in num_cols:
    cap_outliers(eval_df, col)

eval_df.to_csv("EvaluateOnMe_Cleaned.csv", index=False)


  eval_df[col] = pd.to_numeric(eval_df[col], errors="ignore")


In [3]:
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler, PowerTransformer, OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# Feature-engineer the cleaned training data, persisting every fitted
# transformer with joblib, then label-encode the target and write the splits.
train_df = pd.read_csv("TrainOnMe_Cleaned.csv")

# Ordinal-encode x5; categories unseen at transform time are encoded as -1.
ordinal_cols = ["x5"]
ordinal_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
train_df[ordinal_cols] = ordinal_encoder.fit_transform(train_df[ordinal_cols])
joblib.dump(ordinal_encoder, "ordinal_encoder.pkl")

# One-hot encode x7 (first level dropped) when the column is present.
if "x7" in train_df:
    train_df = pd.get_dummies(train_df, columns=["x7"], drop_first=True)

# Numeric columns are selected after the encodings above.
num_cols = train_df.select_dtypes(include=["number"]).columns

# Fit, apply and save each numeric transformer in sequence:
# mean imputation -> standard scaling -> power transform.
num_imputer = SimpleImputer(strategy="mean")
scaler = StandardScaler()
power_transformer = PowerTransformer()
numeric_steps = (
    (num_imputer, "num_imputer.pkl"),
    (scaler, "scaler.pkl"),
    (power_transformer, "power_transformer.pkl"),
)
for step, artifact_path in numeric_steps:
    train_df[num_cols] = step.fit_transform(train_df[num_cols])
    joblib.dump(step, artifact_path)

# Snapshot taken before the target is label-encoded.
train_df.to_csv("TrainOnMe_FeatureEngineered.csv", index=False)

# Encode the target as integers and keep the encoder for decoding predictions.
label_encoder = LabelEncoder()
train_df["y"] = label_encoder.fit_transform(train_df["y"])
joblib.dump(label_encoder, "label_encoder.pkl")

train_df.to_csv("TrainOnMe_Encoded.csv", index=False)

# Stratified 80/20 train/validation split, written out as four CSV files.
X = train_df.drop(columns="y")
y = train_df["y"]
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

split_outputs = (
    (X_train, "X_train.csv"),
    (X_val, "X_val.csv"),
    (y_train, "y_train.csv"),
    (y_val, "y_val.csv"),
)
for part, part_path in split_outputs:
    part.to_csv(part_path, index=False)


In [4]:
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler, PowerTransformer, OrdinalEncoder

# Apply the transformers fitted on the training data to the cleaned evaluation
# set, in the same order the training cell applied them:
# ordinal encode -> one-hot encode -> impute -> scale -> power transform.
eval_df = pd.read_csv("EvaluateOnMe_Cleaned.csv")

num_imputer = joblib.load("num_imputer.pkl")
ordinal_encoder = joblib.load("ordinal_encoder.pkl")
scaler = joblib.load("scaler.pkl")
power_transformer = joblib.load("power_transformer.pkl")

ordinal_cols = ["x5"]
eval_df[ordinal_cols] = ordinal_encoder.transform(eval_df[ordinal_cols])

# Build ALL x7 dummy columns here (no drop_first): which level is "first"
# depends on the levels present in this file, so dropping it could remove a
# different column than training did. The reindex below keeps exactly the
# dummy columns the training set has.
if "x7" in eval_df.columns:
    eval_df = pd.get_dummies(eval_df, columns=["x7"])

# Align to the training feature layout: same columns, same order; columns that
# are absent from the evaluation data are created and filled with 0.
X_train = pd.read_csv("X_train.csv")
eval_df = eval_df.reindex(columns=X_train.columns, fill_value=0)

# Transform exactly the columns the numeric transformers were fitted on, in the
# fitted order, instead of re-inferring them from this frame's dtypes.
fitted_cols = getattr(num_imputer, "feature_names_in_", None)
if fitted_cols is not None:
    num_cols = list(fitted_cols)
else:
    # Transformer was fitted without column names; fall back to dtype inference.
    num_cols = eval_df.select_dtypes(include=["number"]).columns

eval_df[num_cols] = num_imputer.transform(eval_df[num_cols])
eval_df[num_cols] = scaler.transform(eval_df[num_cols])
eval_df[num_cols] = power_transformer.transform(eval_df[num_cols])

eval_df.to_csv("EvaluateOnMe_Preprocessed.csv", index=False)


In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split, KFold, cross_val_score
import joblib
import pandas as pd
import numpy as np

# Load the feature-engineered, label-encoded training data.
train_df = pd.read_csv("TrainOnMe_Encoded.csv")

# Split into feature matrix and target.
X = train_df.drop("y", axis=1)
y = train_df["y"]

# Hold-out fractions to try (train/test = 90/10, 80/20, 70/30, 60/40).
test_sizes = [0.1, 0.2, 0.3, 0.4]
# Numbers of folds used to re-score the tuned model on each training portion.
k_folds = [2, 3, 4]

best_split = None
best_accuracy = 0
best_model = None

# Hyperparameter search space for the random forest.
param_grid = {
    "n_estimators": [100, 250, 500, 700, 1000],
    "max_depth": [None, 10, 20, 30, 40, 50],
    "min_samples_split": [2, 5, 10, 15],
    "min_samples_leaf": [1, 2, 4, 8],
    "max_features": ["sqrt", "log2", None]
}

# Step 1: loop over the different train/test splits.
for test_size in test_sizes:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=42
    )

    rf = RandomForestClassifier(random_state=42)

    # Randomized hyperparameter search, 5-fold CV on the training portion.
    random_search = RandomizedSearchCV(
        rf, param_distributions=param_grid,
        n_iter=50,
        cv=5,
        scoring="accuracy",
        n_jobs=-1,
        verbose=0,
        random_state=42
    )

    random_search.fit(X_train, y_train)
    best_model_for_split = random_search.best_estimator_

    # Step 2: re-score the tuned model with each K-Fold setting.
    for fold in k_folds:
        kf = KFold(n_splits=fold, shuffle=True, random_state=42)
        scores = cross_val_score(best_model_for_split, X_train, y_train, cv=kf, scoring='accuracy')
        accuracy = np.mean(scores)
        print(f"Train-Test Split: {test_size}, K-Fold: {fold}, Accuracy: {accuracy:.4f}")

        # Compare every (split, K) score. Doing this after the inner loop would
        # only ever look at the score of the last K value.
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = best_model_for_split
            best_split = (X_train, X_test, y_train, y_test, test_size)

if best_split is None:
    raise RuntimeError("No train-test split produced a cross-validated accuracy above 0.")

# Step 3: train on the best split.
X_train_best, X_test_best, y_train_best, y_test_best, best_test_size = best_split
best_model.fit(X_train_best, y_train_best)

# Step 4: save the best model.
joblib.dump(best_model, "final_model.pkl")

# Step 5: save the held-out part of the best split.
# NOTE(review): this hold-out set is written to disk but never scored here.
X_test_best.to_csv("X_test_best.csv", index=False)
y_test_best.to_frame().to_csv("y_test_best.csv", index=False)

# Report the selected split and its cross-validated accuracy.
print(f"Best Test Split: {best_test_size:.1%}")
print(f"Best Model Trained with Accuracy = {best_accuracy:.4f}")
print("Final best model has been trained and saved as 'final_model.pkl'.")
print("Best test dataset has been saved as 'X_test_best.csv' and 'y_test_best.csv'.")

Train-Test Split: 0.1, K-Fold: 2, Accuracy: 0.6813
Train-Test Split: 0.1, K-Fold: 3, Accuracy: 0.6853
Train-Test Split: 0.1, K-Fold: 4, Accuracy: 0.6871
Train-Test Split: 0.2, K-Fold: 2, Accuracy: 0.6687
Train-Test Split: 0.2, K-Fold: 3, Accuracy: 0.6728
Train-Test Split: 0.2, K-Fold: 4, Accuracy: 0.6713
Train-Test Split: 0.3, K-Fold: 2, Accuracy: 0.6780
Train-Test Split: 0.3, K-Fold: 3, Accuracy: 0.6871
Train-Test Split: 0.3, K-Fold: 4, Accuracy: 0.6914
Train-Test Split: 0.4, K-Fold: 2, Accuracy: 0.6737
Train-Test Split: 0.4, K-Fold: 3, Accuracy: 0.6817
Train-Test Split: 0.4, K-Fold: 4, Accuracy: 0.6787
Best Test Split: 30.0%
Best Model Trained with Accuracy = 0.6914
Final best model has been trained and saved as 'final_model.pkl'.
Best test dataset has been saved as 'X_test_best.csv' and 'y_test_best.csv'.


In [None]:
import pandas as pd
import joblib

# Restore the trained classifier and the label encoder saved during training.
best_model = joblib.load("final_model.pkl")
label_encoder = joblib.load("label_encoder.pkl")

# Predict on the preprocessed evaluation features and map the integer classes
# back to their original label values.
eval_df = pd.read_csv("EvaluateOnMe_Preprocessed.csv")
predictions = best_model.predict(eval_df)
predictions_labels = label_encoder.inverse_transform(predictions)

# Write one predicted label per line.
with open("Label.txt", "w", encoding="utf-8") as f:
    f.writelines(label + "\n" for label in predictions_labels)

# Short summary of what was written.
print("\n Predictions have been generated and saved in 'Label.txt'.")
print(f"Total Predictions: {len(predictions_labels)}")
print("First 5 Predictions:", predictions_labels[:5], sep="\n")



 Predictions have been generated and saved in 'Label.txt'.
Total Predictions: 10000
First 5 Predictions:
['Antrophic' 'OpenAI' 'Antrophic' 'OpenAI' 'Mistral']
