In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# Training data: this is the frame used to fit and tune the model.
df = pd.read_csv(filepath_or_buffer='train.csv')

In [None]:
# Features to generate predictions for once the model is ready.
test = pd.read_csv(filepath_or_buffer='test.csv')

In [None]:
# Predictions must be written to a file named `submission.csv`.
# The first rows of the sample submission show the required format:
pd.read_csv(filepath_or_buffer='sample_submission.csv', nrows=5)

In [None]:
# Same five sample-submission rows, shown as a raw NumPy array.
pd.read_csv('sample_submission.csv', nrows=5).to_numpy()

In [None]:
# Peek at the first five rows of the training frame.
df.head(n=5)

In [None]:
# Step 2: Data Cleaning
# `data` is the working frame used by every later cell. It was never defined,
# so the notebook failed with NameError on a fresh kernel. Create it here as a
# copy of the training frame, so in-place changes to `data` leave `df` intact.
data = df.copy()

# Check for missing values (count of nulls per column)
print("\nMissing Values:\n", data.isnull().sum())

In [None]:
# Step 3: Exploratory Data Analysis (EDA)
# Show each column's dtype, then list the distinct values of every
# object-typed (categorical) column.
print("\nData Types:\n", data.dtypes)

print("\nUnique values in categorical columns:")
object_columns = data.select_dtypes(include='object').columns
for name in object_columns:
    distinct_values = data[name].unique()
    print(f"{name}: {distinct_values}")

In [None]:
# Visualize correlations (for numerical features)
# Restrict to numeric columns first: `data` may still contain object (string)
# columns at this point, and DataFrame.corr() raises on non-numeric columns
# in pandas >= 2.0 instead of silently dropping them.
numeric_data = data.select_dtypes(include='number')

plt.figure(figsize=(10, 6))
sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm')
plt.title("Correlation Matrix")
plt.show()

In [None]:
# Class balance of the target. The column is assumed to be named
# 'is_canceled'; the plot is skipped when no such column exists.
target_present = 'is_canceled' in data.columns
if target_present:
    sns.countplot(data=data, x='is_canceled')
    plt.title("Cancellation Distribution")
    plt.show()

In [None]:
# Step 4: Feature Engineering
# Integer-encode every object-typed column of `data` in place, keeping one
# fitted LabelEncoder per column in `label_encoders` (keyed by column name).
categorical_cols = data.select_dtypes(include='object').columns
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, encoder in label_encoders.items():
    data[col] = encoder.fit_transform(data[col])

In [None]:
# Separate the target from the features
target_column = 'is_canceled'  # Replace with your target column
y = data[target_column]
X = data.drop(columns=[target_column])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Standardize the features: the scaler is fitted on the training split only
# and then applied to both splits. Note that X_train / X_test are rebound to
# the scaled outputs from here on.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Step 5: Model Training
# Random Forest classifier: 100 trees, depth capped at 10, fixed seed
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
)
rf_model.fit(X_train, y_train)

# Step 6: Model Evaluation
# Predictions for the held-out split
y_pred = rf_model.predict(X_test)

In [None]:
# Evaluate performance: print each metric for the held-out predictions,
# in the order confusion matrix, classification report, accuracy.
metric_reports = (
    ("\nConfusion Matrix:\n", confusion_matrix),
    ("\nClassification Report:\n", classification_report),
    ("\nAccuracy Score:", accuracy_score),
)
for heading, metric in metric_reports:
    print(heading, metric(y_test, y_pred))


In [None]:
# Step 7: Feature Importance (Optional)
# Pair each feature column of X with the importance the fitted forest
# assigned to it, ordered from most to least important.
feature_importances = rf_model.feature_importances_
important_features = (
    pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

# Horizontal bar chart of the ranked importances
plt.figure(figsize=(10, 6))
sns.barplot(data=important_features, x='Importance', y='Feature')
plt.title("Feature Importance")
plt.show()