In [19]:
!pip install catboost
!pip install lightgbm
!pip install category_encoders
!pip install imbalanced-learn




In [20]:
# Import Libraries
import pandas as pd
import numpy as np
from google.colab import drive

# Data manipulation and preprocessing
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, precision_recall_curve
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import StackingClassifier

# Machine learning models
from catboost import CatBoostClassifier
import lightgbm as lgbm
from xgboost import XGBClassifier

# Handling imbalanced data
from imblearn.combine import SMOTEENN
from imblearn.pipeline import Pipeline as ImbPipeline

# Additional tools
import category_encoders as ce
import warnings
warnings.filterwarnings('ignore')


In [21]:
# Load data
train_data = pd.read_csv("/content/final_proj_data.csv")
test_data = pd.read_csv('/content/final_proj_test.csv')

# Print initial data info
print("Training Data Shape:", train_data.shape)
print("\nTest Data Shape:", test_data.shape)
print("\nTraining Data Info:")
print(train_data.info())


Training Data Shape: (10000, 231)

Test Data Shape: (2500, 230)

Training Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Columns: 231 entries, Var1 to y
dtypes: float64(191), int64(2), object(38)
memory usage: 17.6+ MB
None


In [22]:
def analyze_and_select_numerical_features(X, y, correlation_threshold=0.8):
    """
    Analyze correlations and evaluate feature importance before removal
    """
    # Get numeric features
    numeric_features = X.select_dtypes(include=['int64', 'float64']).columns

    # Calculate correlation matrix
    corr_matrix = X[numeric_features].corr().abs()

    # Find highly correlated pairs
    high_corr_pairs = []
    for i in range(len(corr_matrix.columns)):
        for j in range(i+1, len(corr_matrix.columns)):
            if corr_matrix.iloc[i, j] > correlation_threshold:
                feat1, feat2 = corr_matrix.columns[i], corr_matrix.columns[j]
                corr_val = corr_matrix.iloc[i, j]
                high_corr_pairs.append((feat1, feat2, corr_val))

    if high_corr_pairs:
        print("\nHighly correlated feature pairs:")
        for f1, f2, corr in high_corr_pairs:
            print(f"{f1} - {f2}: {corr:.3f}")

    # Initial feature importance using CatBoost
    print("\nCalculating initial feature importance...")
    initial_model = CatBoostClassifier(
        iterations=100,
        learning_rate=0.1,
        depth=6,
        verbose=0
    )

    # Fill NaN values for initial importance calculation
    X_filled = X[numeric_features].fillna(X[numeric_features].median())
    initial_model.fit(X_filled, y)

    # Get feature importance
    importance_df = pd.DataFrame({
        'feature': numeric_features,
        'importance': initial_model.feature_importances_
    }).sort_values('importance', ascending=False)

    print("\nTop 10 most important features:")
    print(importance_df.head(10))

    # Strategy: Keep both correlated features if both are in top important features
    top_important = set(importance_df.head(20)['feature'])
    features_to_remove = set()

    for feat1, feat2, corr in high_corr_pairs:
        # If both features are important, keep them
        if feat1 in top_important and feat2 in top_important:
            continue
        # If one is important, remove the other
        elif feat1 in top_important:
            features_to_remove.add(feat2)
        elif feat2 in top_important:
            features_to_remove.add(feat1)
        # If neither is important, remove the one with lower importance
        else:
            feat1_imp = importance_df[importance_df['feature'] == feat1]['importance'].iloc[0]
            feat2_imp = importance_df[importance_df['feature'] == feat2]['importance'].iloc[0]
            if feat1_imp < feat2_imp:
                features_to_remove.add(feat1)
            else:
                features_to_remove.add(feat2)

    selected_features = [f for f in numeric_features if f not in features_to_remove]

    print(f"\nSelected {len(selected_features)} features")
    print(f"Removed {len(features_to_remove)} features")

    return selected_features, features_to_remove, importance_df


In [23]:
def identify_cat_feature_types(data):
    """Identify numeric and categorical features"""
    categorical_features = data.select_dtypes(include=['object']).columns

    # Split categorical features based on cardinality
    low_cardinality = []
    high_cardinality = []
    for col in categorical_features:
        if data[col].nunique() <= 10:
            low_cardinality.append(col)
        else:
            high_cardinality.append(col)

    return low_cardinality, high_cardinality

In [28]:
# Drop columns with 100% missing values
missing_percentage = train_data.isnull().mean() * 100
columns_to_drop = missing_percentage[missing_percentage == 100].index
train_data_cleaned = train_data.drop(columns=columns_to_drop)

In [37]:
import numpy as np
import random
import os

def set_all_seeds(seed=42):
    """Set all random seeds for reproducibility"""
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

set_all_seeds()

In [39]:
# Prepare data
X = train_data_cleaned.drop('y', axis=1)
y = train_data_cleaned['y']

# Analyze features
selected_features, features_to_remove, importance_df = analyze_and_select_numerical_features(X, y)
low_cardinality, high_cardinality = identify_cat_feature_types(train_data_cleaned)

print("\nFeature types found:")
print(f"Numeric features ({len(selected_features)}): {selected_features}")
print(f"Low cardinality features ({len(low_cardinality)}): {low_cardinality}")
print(f"High cardinality features ({len(high_cardinality)}): {high_cardinality}")

# Split data with stratification
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Calculate weights for balanced classes
pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
print(f"\nPositive class weight: {pos_weight:.2f}")

# Create preprocessing pipeline optimized for telecom data

preprocessor = ColumnTransformer(
    transformers=[
        # Numeric features - use robust scaling due to possible outliers
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),  # Changed to median
            ('scaler', RobustScaler())
        ]), selected_features),

        # Low cardinality categorical features
        ('low_card', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), low_cardinality),

        # High cardinality categorical features
        ('high_card', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('encoder', ce.CatBoostEncoder(sigma=0.05))
        ]), high_cardinality)
    ],
    remainder='drop',
    verbose_feature_names_out=False
)

# Process features
print("\nProcessing features...")
X_train_processed = preprocessor.fit_transform(X_train, y_train)
X_val_processed = preprocessor.transform(X_val)

print(f"\nProcessed training data shape: {X_train_processed.shape}")
print(f"Processed validation data shape: {X_val_processed.shape}")

# Create model optimized for telecom churn
model = CatBoostClassifier(
        iterations=2000,
        learning_rate=0.02,
        depth=6,
        l2_leaf_reg=3,
        min_data_in_leaf=10,
        random_strength=0,
        scale_pos_weight=pos_weight,
        random_state=42,
        verbose=100,
        loss_function='Logloss',
        eval_metric='AUC',
        bootstrap_type='No',
        leaf_estimation_method='Newton',
        max_bin=256
    )

print("\nTraining model...")
model.fit(
    X_train_processed,
    y_train,
    eval_set=[(X_val_processed, y_val)],
    early_stopping_rounds=50,
    verbose=100
)

#Evaluate on validation set
y_val_pred = model.predict(X_val_processed)

print("\nClassification Report:")
print(classification_report(y_val, y_val_pred))


Highly correlated feature pairs:
Var1 - Var9: 0.926
Var1 - Var41: 0.872
Var1 - Var63: 0.873
Var1 - Var66: 0.926
Var1 - Var77: 0.904
Var1 - Var129: 0.837
Var1 - Var156: 0.926
Var1 - Var187: 0.971
Var9 - Var63: 0.990
Var9 - Var66: 1.000
Var9 - Var77: 0.992
Var9 - Var121: 0.955
Var9 - Var129: 0.977
Var9 - Var156: 1.000
Var9 - Var187: 0.974
Var12 - Var62: 0.987
Var12 - Var63: 0.813
Var12 - Var121: 0.890
Var12 - Var129: 0.840
Var17 - Var18: 0.945
Var17 - Var88: 0.989
Var17 - Var99: 0.975
Var17 - Var101: 0.960
Var17 - Var127: 0.972
Var17 - Var128: 0.989
Var17 - Var145: 0.986
Var17 - Var158: 0.893
Var17 - Var164: 0.937
Var17 - Var174: 0.970
Var17 - Var179: 0.951
Var18 - Var88: 0.968
Var18 - Var99: 0.960
Var18 - Var101: 0.973
Var18 - Var127: 0.984
Var18 - Var128: 0.968
Var18 - Var145: 0.968
Var18 - Var158: 0.909
Var18 - Var164: 0.935
Var18 - Var174: 0.946
Var18 - Var179: 0.938
Var21 - Var22: 1.000
Var21 - Var25: 0.821
Var21 - Var83: 0.814
Var21 - Var109: 0.805
Var21 - Var112: 0.824
Var21 - Va

In [26]:
# Process full training data
print("\nTraining final model on full dataset...")
X_full_processed = preprocessor.fit_transform(X, y)
model.fit(X_full_processed, y, verbose=100)



Training final model on full dataset...
0:	total: 27.6ms	remaining: 55.2s
100:	total: 2.67s	remaining: 50.3s
200:	total: 6.88s	remaining: 1m 1s
300:	total: 9.22s	remaining: 52.1s
400:	total: 11.7s	remaining: 46.7s
500:	total: 14.2s	remaining: 42.5s
600:	total: 16.8s	remaining: 39.1s
700:	total: 21s	remaining: 38.8s
800:	total: 23.3s	remaining: 34.9s
900:	total: 25.7s	remaining: 31.4s
1000:	total: 28.1s	remaining: 28s
1100:	total: 30.9s	remaining: 25.2s
1200:	total: 35.5s	remaining: 23.6s
1300:	total: 38.4s	remaining: 20.6s
1400:	total: 40.7s	remaining: 17.4s
1500:	total: 43.2s	remaining: 14.4s
1600:	total: 46.9s	remaining: 11.7s
1700:	total: 49.9s	remaining: 8.77s
1800:	total: 52.4s	remaining: 5.79s
1900:	total: 54.7s	remaining: 2.85s
1999:	total: 57.1s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7e6524341b70>

In [27]:
test_data_cleaned = test_data.drop(columns=columns_to_drop)

X_test_processed = preprocessor.transform(test_data_cleaned)

y_test_pred = model.predict(X_test_processed)


submission = pd.DataFrame({
    'index': range(len(y_test_pred)),
    'y': y_test_pred
})

# Save submission
submission.to_csv('/content/improved_submission.csv', index=False)
print("\nSubmission file saved as 'improved_submission.csv'")


Submission file saved as 'improved_submission.csv'
