In [2]:
# Task 1: ETL Pipeline using Pandas and Scikit-Learn

# This notebook performs a basic ETL (Extract, Transform, Load) process on customer data. It includes:
# - Data loading
# - Preprocessing (missing values, encoding)
# - Feature scaling
# - Data splitting
# - Saving processed datasets


import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Load the data
# NOTE: '/content/...' is a Colab-style absolute path; adjust it when running elsewhere.
data = pd.read_csv('/content/customers.csv')
print("Original Data:\n", data)

# Fill missing values by carrying the previous row's value forward.
# DataFrame.ffill() is the supported spelling; fillna(method='ffill') is
# deprecated and emits a FutureWarning on recent pandas releases.
# NOTE(review): forward fill cannot fill a gap in the very first row, so a
# leading NaN would survive this step -- confirm the input never has one.
data = data.ffill()

# Encode categorical variables as integer codes.
# One encoder per column is kept so each mapping can be reversed later with
# inverse_transform().
le_gender = LabelEncoder()
le_country = LabelEncoder()
le_churn = LabelEncoder()

data['Gender'] = le_gender.fit_transform(data['Gender'])
data['Country'] = le_country.fit_transform(data['Country'])
data['Churn'] = le_churn.fit_transform(data['Churn'])

# Split the dataset BEFORE scaling so the scaler never sees the test rows.
# 'Churn' is the target; 'Name' is dropped from the feature matrix.
X = data.drop(['Churn', 'Name'], axis=1)
y = data['Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale numeric columns.
# The scaler is fitted on the training rows only and then applied unchanged
# to the test rows; fitting on the full dataset would leak the test set's
# mean/variance into the training features.
# As a consequence `data` and `X` keep the unscaled values -- only X_train
# and X_test hold standardized columns.
numeric_cols = ['Age', 'Salary']
scaler = StandardScaler()

# Work on explicit copies so the column assignments below operate on
# independent frames rather than on slices of X.
X_train = X_train.copy()
X_test = X_test.copy()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

# Save output files in Colab (relative paths -> current working directory).
# index=False keeps the original row index out of the CSVs.
X_train.to_csv('X_train.csv', index=False)
X_test.to_csv('X_test.csv', index=False)
y_train.to_csv('y_train.csv', index=False)
y_test.to_csv('y_test.csv', index=False)

print("\n Done. Files saved in Colab session:")
print("- X_train.csv\n- X_test.csv\n- y_train.csv\n- y_test.csv")

Original Data:
       Name  Gender   Age Country   Salary Churn
0    Alice  Female  25.0   India  40000.0    No
1      Bob    Male  30.0     USA  50000.0   Yes
2  Charlie    Male   NaN      UK  55000.0    No
3    Diana  Female  28.0   India      NaN   Yes
4      Eva  Female  35.0     USA  60000.0    No

 Done. Files saved in Colab session:
- X_train.csv
- X_test.csv
- y_train.csv
- y_test.csv


  data = data.fillna(method='ffill')
