<a href="https://colab.research.google.com/github/Sehajbirsingh/ML-1/blob/main/data_preprocessing_task.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [2]:
df=pd.read_csv('X_train.csv')


In [4]:
print(df.isnull().sum())

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
dtype: int64


In [11]:
categorical_columns = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']
numerical_columns = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']

# Handle categorical variables
# We'll use one-hot encoding for categorical variables
onehot = OneHotEncoder(sparse=False, handle_unknown='ignore')
ct = ColumnTransformer([("encoder", onehot, categorical_columns)], remainder='passthrough')

# Fit and transform the data
X = ct.fit_transform(df)



In [12]:
# Get the new column names
onehot_columns = ct.named_transformers_['encoder'].get_feature_names(categorical_columns)
all_columns = np.concatenate([onehot_columns, numerical_columns])

# Create a new dataframe with encoded features
df_encoded = pd.DataFrame(X, columns=all_columns, index=df.index)

# Add back the Loan_ID column
df_encoded['Loan_ID'] = df['Loan_ID']

# Display the first few rows of the processed dataset
print(df_encoded.head())

AttributeError: 'OneHotEncoder' object has no attribute 'get_feature_names'

In [13]:
# Simple Loan Data Preprocessing Task

import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv('X_train.csv')

# Display the first few rows and basic information
print(df.head())
print(df.info())

# Check for missing values
print(df.isnull().sum())

# Identify categorical and numerical columns
categorical_columns = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']
numerical_columns = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']

# Handle categorical variables using pandas get_dummies
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Reorder columns to group encoded features
columns_order = ['Loan_ID'] + [col for col in df_encoded.columns if col.startswith(tuple(categorical_columns))] + numerical_columns
df_encoded = df_encoded[columns_order]

# Display the first few rows of the processed dataset
print(df_encoded.head())

# Display basic information about the processed dataset
print(df_encoded.info())

# Summary statistics of the processed dataset
print(df_encoded.describe())

# Save the processed dataset
df_encoded.to_csv('processed_loan_data.csv', index=False)
print("Processed data saved to 'processed_loan_data.csv'")

    Loan_ID Gender Married Dependents Education Self_Employed  \
0  LP001032   Male      No          0  Graduate            No   
1  LP001824   Male     Yes          1  Graduate            No   
2  LP002928   Male     Yes          0  Graduate            No   
3  LP001814   Male     Yes          2  Graduate            No   
4  LP002244   Male     Yes          0  Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             4950                0.0         125               360   
1             2882             1843.0         123               480   
2             3000             3416.0          56               180   
3             9703                0.0         112               360   
4             2333             2417.0         136               360   

   Credit_History Property_Area  
0               1         Urban  
1               1     Semiurban  
2               1     Semiurban  
3               1         Urban  
4           

In [14]:
import pandas as pd
import numpy as np

# Step 2: Loading the dataset
df = pd.read_csv('X_train.csv')

# Step 3: Exploring the dataset
print("Data Preview:")
print(df.head())

print("\nData Info:")
print(df.info())

print("\nMissing Values:")
print(df.isnull().sum())

Data Preview:
    Loan_ID Gender Married Dependents Education Self_Employed  \
0  LP001032   Male      No          0  Graduate            No   
1  LP001824   Male     Yes          1  Graduate            No   
2  LP002928   Male     Yes          0  Graduate            No   
3  LP001814   Male     Yes          2  Graduate            No   
4  LP002244   Male     Yes          0  Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             4950                0.0         125               360   
1             2882             1843.0         123               480   
2             3000             3416.0          56               180   
3             9703                0.0         112               360   
4             2333             2417.0         136               360   

   Credit_History Property_Area  
0               1         Urban  
1               1     Semiurban  
2               1     Semiurban  
3               1         Urban 

In [15]:
# We can use one-hot encoding or label encoding. Let's use label encoding for simplicity here.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
for col in categorical_columns:
    df[col] = le.fit_transform(df[col])

print("\nData after Label Encoding:")
print(df.head())



Data after Label Encoding:
    Loan_ID  Gender  Married  Dependents  Education  Self_Employed  \
0  LP001032       1        0           0          0              0   
1  LP001824       1        1           1          0              0   
2  LP002928       1        1           0          0              0   
3  LP001814       1        1           2          0              0   
4  LP002244       1        1           0          0              0   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             4950                0.0         125               360   
1             2882             1843.0         123               480   
2             3000             3416.0          56               180   
3             9703                0.0         112               360   
4             2333             2417.0         136               360   

   Credit_History  Property_Area  
0               1              2  
1               1              1  
2               1  

In [16]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

print("\nData after Normalization:")
print(df.head())


Data after Normalization:
    Loan_ID  Gender  Married  Dependents  Education  Self_Employed  \
0  LP001032       1        0           0          0              0   
1  LP001824       1        1           1          0              0   
2  LP002928       1        1           0          0              0   
3  LP001814       1        1           2          0              0   
4  LP002244       1        1           0          0              0   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0         0.059369           0.000000    0.196277          0.729730   
1         0.033791           0.054467    0.192893          1.000000   
2         0.035250           0.100955    0.079526          0.324324   
3         0.118157           0.000000    0.174281          0.729730   
4         0.027001           0.071431    0.214890          0.729730   

   Credit_History  Property_Area  
0             1.0              2  
1             1.0              1  
2             1.0   

In [17]:
df.to_csv('processed_data.csv', index=False)
print("Processed data saved as 'processed_data.csv'")

Processed data saved as 'processed_data.csv'
