In [None]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from google.colab import files


# Upload dataset
# The object returned by files.upload() is iterated below to obtain the name
# of the uploaded file, which is then read from disk as CSV.
uploaded = files.upload()
if not uploaded:
    # Nothing was uploaded (e.g. the picker was cancelled). Fail with a clear
    # message instead of the bare StopIteration that next(iter(...)) raises
    # on an empty mapping.
    raise ValueError('No file was uploaded; please select a CSV file.')
# Only the first uploaded entry is loaded; any further uploads are ignored.
filename = next(iter(uploaded))
df = pd.read_csv(filename)


# Explore dataset: column schema, per-column missing counts, summary stats
df.info()
missing_per_column = df.isna().sum()
print(missing_per_column)
summary_stats = df.describe()
print(summary_stats)


# Handle missing values
# Numeric columns: fill gaps with the column median.
df.fillna(df.median(numeric_only=True), inplace=True)
# Object (text) columns: fill gaps with the most frequent value. The result is
# assigned back instead of calling fillna(inplace=True) on df[col]: that
# chained in-place form acts on a temporary under pandas copy-on-write, leaving
# df unchanged, and emits a FutureWarning on pandas 2.x.
for col in df.select_dtypes(include=['object']).columns:
    modes = df[col].mode()
    if modes.empty:
        # The column is entirely missing, so there is no mode to fill with.
        continue
    df[col] = df[col].fillna(modes.iloc[0])


# Re-check the per-column missing counts after imputation.
print(df.isnull().sum())


# Remove duplicates, then report how many duplicated rows are still present
df.drop_duplicates(inplace=True)
remaining_duplicates = df.duplicated().sum()
print(f'Duplicates removed, remaining: {remaining_duplicates}')


# Standardize categorical data example
if 'Gender' in df.columns:
    # Trim surrounding whitespace first, then lower-case the labels.
    trimmed_gender = df['Gender'].str.strip()
    df['Gender'] = trimmed_gender.str.lower()
if 'SeniorCitizen' in df.columns:
    # Replace the 1/0 flag values with 'Yes'/'No' labels.
    flag_labels = {1: 'Yes', 0: 'No'}
    df['SeniorCitizen'] = df['SeniorCitizen'].replace(flag_labels)


# Convert data types
# Each listed column is cast to its target dtype only when it is present.
dtype_overrides = {'Churn': 'category', 'CustomerID': str}
for column_name, target_dtype in dtype_overrides.items():
    if column_name in df.columns:
        df[column_name] = df[column_name].astype(target_dtype)


print(df.dtypes)


# Handle outliers using IQR for numeric columns only
# A row is dropped when any of its numeric values lies more than 1.5 * IQR
# below the first quartile or above the third quartile of its column.
numeric_df = df.select_dtypes(include=np.number)
Q1 = numeric_df.quantile(0.25)
Q3 = numeric_df.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR


# True for rows whose numeric values all fall inside the bounds.
condition = ~((numeric_df < lower_bound) | (numeric_df > upper_bound)).any(axis=1)
# .copy() gives an independent frame so later column assignments do not
# trigger chained-assignment warnings on a filtered view.
df_no_outliers = df[condition].copy()
# Carry the filtered frame forward. Previously the result was only printed and
# then discarded, so every later step (and the exported CSV) kept the outliers.
# NOTE(review): a numeric column whose IQR is 0 makes every value different
# from its quartile value count as an outlier -- confirm that is acceptable
# for this dataset.
df = df_no_outliers
print(f"Shape after removing outliers: {df_no_outliers.shape}")


# Feature engineering: create 'Tenure' if date column exists
if 'ContractStartDate' in df.columns:
    # Unparseable dates become NaT (errors='coerce').
    start_dates = pd.to_datetime(df['ContractStartDate'], errors='coerce')
    # Take "today" once so every row is measured against the same instant,
    # rather than re-evaluating it for each row.
    today = pd.to_datetime('today')
    # Whole days elapsed, floor-divided into 30-day blocks; NaT rows yield NaN.
    df['Tenure'] = (today - start_dates).dt.days // 30
    # Assign the filled column back: fillna(inplace=True) on df['Tenure'] is a
    # chained in-place call that leaves df unchanged under pandas
    # copy-on-write and emits a FutureWarning on pandas 2.x.
    df['Tenure'] = df['Tenure'].fillna(df['Tenure'].median())


# Scale numerical columns if they exist
# Each present column gets its own freshly fitted MinMaxScaler.
# NOTE(review): the scalers are fitted on the full frame before the train/test
# split further down, so test rows influence the scaling -- confirm whether
# that is intended.
num_cols = ['MonthlyCharges', 'TotalCharges']
for col in num_cols:
    if col not in df.columns:
        continue
    scaler = MinMaxScaler()
    scaled_values = scaler.fit_transform(df[[col]])
    df[col] = scaled_values


# Prepare features and target
if 'Churn' in df.columns:
    # Target is the 'Churn' column; features are everything else except the
    # identifier column (dropped only if it is present).
    y = df['Churn']
    X = df.drop(columns=['Churn', 'CustomerID'], errors='ignore')


    # Split into train/test: 20% held out, fixed seed for repeatability
    split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train, X_test, y_train, y_test = split_parts


    train_shape, test_shape = X_train.shape, X_test.shape
    print(f"Train shape: {train_shape}, Test shape: {test_shape}")


# Export cleaned data (without the index column)
output_path = 'Cleaned_Telecom_Customer_Churn.csv'
df.to_csv(output_path, index=False)


# Download the cleaned file
files.download(output_path)


