Exploratory Data Analysis (EDA)


Scaling/Normalization: Standardize the dataset to ensure consistent data scales.
Fill Missing Values: Handle missing data points appropriately.
Feature Selection & Engineering: Identify and create relevant features for the model.


In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load dataset
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df = pd.read_csv(r"D:\data science\capstone project\587_cap_proj_dataset_v1.0\train(1).csv")

# Display basic statistics and information
print(df.describe())
df.info()  # info() prints its own report and returns None, so it is not wrapped in print()

# Handling missing values
# Fill NaNs with the per-column mean; numeric_only=True keeps this from failing if a
# non-numeric column is ever present.
# NOTE(review): several columns in the summary statistics have a minimum of -1. If -1 is
# this dataset's missing-value marker, fillna() leaves those rows untouched — confirm.
df_filled = df.fillna(df.mean(numeric_only=True))

# Feature Scaling/Normalization
# Only predictor columns are standardized. The row identifier and the 0/1 target are
# passed through unchanged: a standardized target is continuous and cannot be used as
# a class label by classifiers or classification metrics.
passthrough_cols = [col for col in ('id', 'target') if col in df_filled.columns]
predictor_cols = [col for col in df_filled.columns if col not in passthrough_cols]

scaler = StandardScaler()
scaled_predictors = pd.DataFrame(
    scaler.fit_transform(df_filled[predictor_cols]),
    columns=predictor_cols,
    index=df_filled.index,
)
# Same columns, same order as df_filled; only the predictor values are rescaled.
df_scaled = pd.concat([df_filled[passthrough_cols], scaled_predictors], axis=1)[df_filled.columns]

# Extracting features and target
# The identifier carries no predictive signal, so it is dropped along with the target.
features = df_scaled.drop(columns=passthrough_cols)
target = df_scaled['target']  # original class labels, not rescaled


                 id         target      ps_ind_01  ps_ind_02_cat  \
count  5.952120e+05  595212.000000  595212.000000  595212.000000   
mean   7.438036e+05       0.036448       1.900378       1.358943   
std    4.293678e+05       0.187401       1.983789       0.664594   
min    7.000000e+00       0.000000       0.000000      -1.000000   
25%    3.719915e+05       0.000000       0.000000       1.000000   
50%    7.435475e+05       0.000000       1.000000       1.000000   
75%    1.115549e+06       0.000000       3.000000       2.000000   
max    1.488027e+06       1.000000       7.000000       4.000000   

           ps_ind_03  ps_ind_04_cat  ps_ind_05_cat  ps_ind_06_bin  \
count  595212.000000  595212.000000  595212.000000  595212.000000   
mean        4.423318       0.416794       0.405188       0.393742   
std         2.699902       0.493311       1.350642       0.488579   
min         0.000000      -1.000000      -1.000000       0.000000   
25%         2.000000       0.000000       

Machine Learning Modeling


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

# Split dataset
# Class labels are read from the raw frame so they are the original integer classes;
# classification estimators and metrics reject continuous targets.
labels = df['target'].astype(int)

# The positive class is rare (target mean is about 0.036 in the summary statistics),
# so the split is stratified to keep the same class ratio in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels
)

# Train model
# This is a classification task, so a classifier is used: a regressor returns
# continuous scores, which f1/accuracy/precision/recall cannot evaluate.
# random_state makes the forest reproducible; n_jobs=-1 only parallelizes the fit.
model = RandomForestClassifier(random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# Evaluate model
y_pred = model.predict(X_test)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f'F1 Score: {f1}')
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the raw frame.
summary_stats = df.describe()
print(summary_stats)


In [None]:
# Share of rows in each target class (normalize=True returns fractions, not counts).
target_column = df['target']
target_distribution = target_column.value_counts(normalize=True)
print(target_distribution)


In [None]:
# Categorical columns are identified two ways:
#   * object dtype (string-valued columns), as before, and
#   * a `_cat` name suffix — columns such as ps_ind_02_cat are stored as numbers, so a
#     dtype check alone does not find them.
# NOTE(review): the suffix rule follows the column naming visible in the summary
# statistics; confirm it against the dataset's data dictionary.
is_object_dtype = df.columns.isin(df.select_dtypes(include=['object']).columns)
has_cat_suffix = df.columns.astype(str).str.endswith('_cat')
categorical_features = df.columns[is_object_dtype | has_cat_suffix]  # keeps df column order
print(f'Categorical Features: {len(categorical_features)}')


In [None]:
# Count missing entries per column, largest first.
# Besides real NaNs, -1 is counted as missing: several columns have a minimum of -1 in
# the summary statistics, and this dataset appears to use -1 as its missing-value
# marker, which isnull() alone does not detect.
# NOTE(review): confirm the -1 convention against the dataset's data dictionary.
MISSING_SENTINEL = -1
missing_mask = df.isnull() | df.eq(MISSING_SENTINEL)
missing_values = missing_mask.sum().sort_values(ascending=False)
print(missing_values.head(2))


In [None]:
# Number of columns whose missing-entry count is non-zero.
has_missing = missing_values > 0
num_features_with_missing = has_missing.sum()
print(f'Total Features with Missing Values: {num_features_with_missing}')


In [None]:
# Pairwise Pearson correlation (the pandas default) between all columns of the raw frame.
correlation_matrix = df.corr(method='pearson')
print(correlation_matrix)


In [None]:
# Ordinal feature candidates are derived from the frame instead of hard-coded
# placeholder names (which are not columns of this dataset and raise KeyError).
# Rule used: integer-typed columns that are neither the identifier/target nor flagged
# as categorical (`_cat`) or binary (`_bin`) by their name suffix.
# NOTE(review): presumably these are the ordinal features — confirm against the
# dataset's data dictionary before relying on the result.
non_feature_cols = {'id', 'target'}
ordinal_features = [
    col
    for col in df.columns
    if col not in non_feature_cols
    and pd.api.types.is_integer_dtype(df[col])
    and not str(col).endswith(('_cat', '_bin'))
]
ord_correlation = df[ordinal_features].corr()
print(ord_correlation)


In [None]:
# One-hot encode the columns listed in `categorical_features`; all other columns are
# carried over unchanged.
df_encoded = pd.get_dummies(data=df, columns=categorical_features)


In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression model on the training split (fit() returns the estimator),
# then predict classes for the held-out split.
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the predictions first, then report each metric.
lr_accuracy = accuracy_score(y_test, y_pred)
lr_precision = precision_score(y_test, y_pred)
lr_recall = recall_score(y_test, y_pred)

print(f'Accuracy: {lr_accuracy}')
print(f'Precision: {lr_precision}')
print(f'Recall: {lr_recall}')


XGBoost


In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees with library-default hyperparameters.
model = XGBClassifier()
model.fit(X_train, y_train)

# Predict on the held-out split and report F1.
y_pred = model.predict(X_test)
xgb_f1 = f1_score(y_test, y_pred)
print(f'F1 Score: {xgb_f1}')


In [None]:
# Column count of the one-hot encoded frame.
num_features_after_encoding = len(df_encoded.columns)
print(f'Number of Features after Encoding: {num_features_after_encoding}')


Neural Networks


In [None]:
from sklearn.neural_network import MLPClassifier

# Single hidden layer of 100 units. random_state is pinned (same seed as the
# train/test split) so weight initialization and batch sampling are repeatable and
# the reported score is the same on every run.
model = MLPClassifier(hidden_layer_sizes=(100,), random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(f'F1 Score: {f1_score(y_test, y_pred)}')


In [None]:
# NOTE(review): every modeling cell assigns its estimator to the same name `model`, so
# this line binds whichever estimator was fitted most recently; it does not compare
# metrics itself. TODO: keep each fitted estimator under its own name and select by
# the chosen metric.
best_model = model  # Choose based on metrics like recall


Improvement with Ensemble Methods:


In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with library-default settings, trained on the training split.
model = AdaBoostClassifier()
model.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = model.predict(X_test)
ada_f1 = f1_score(y_test, y_pred)
print(f'F1 Score: {ada_f1}')
