In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

file_path = 'loan_prediction (1).csv' 
data = pd.read_csv(file_path)
print("Original Data Shape: ", data.shape)
print(data.info())

numeric_columns = data.select_dtypes(include=['number']).columns
non_numeric_columns = data.select_dtypes(exclude=['number']).columns

imputer_numeric = SimpleImputer(strategy='mean')
data_numeric = imputer_numeric.fit_transform(data[numeric_columns])

print("Numeric data after imputation: ", data_numeric.shape)

data[numeric_columns] = pd.DataFrame(data_numeric, columns=numeric_columns)

imputer_non_numeric = SimpleImputer(strategy='most_frequent')
data_non_numeric = imputer_non_numeric.fit_transform(data[non_numeric_columns])

print("Non-numeric data after imputation: ", data_non_numeric.shape)

data[non_numeric_columns] = pd.DataFrame(data_non_numeric, columns=non_numeric_columns)

print("Final Data Shape: ", data.shape)

label_encoder = LabelEncoder()

for column in non_numeric_columns:
    data[column] = label_encoder.fit_transform(data[column])

X = data.drop(columns='target_column')  # Features
y = data['target_column']  # Target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


lr_model = LinearRegression()
lr_model.fit(X_train_scaled, y_train)

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)

lr_predictions = lr_model.predict(X_test_scaled)
rf_predictions = rf_model.predict(X_test_scaled)

lr_mse = mean_squared_error(y_test, lr_predictions)
lr_r2 = r2_score(y_test, lr_predictions)

rf_mse = mean_squared_error(y_test, rf_predictions)
rf_r2 = r2_score(y_test, rf_predictions)

print(f"Linear Regression MSE: {lr_mse}, R2 Score: {lr_r2}")
print(f"Random Forest MSE: {rf_mse}, R2 Score: {rf_r2}")


Original Data Shape:  (614, 13)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB
None
Numeric data after imputation:  (614, 5)
Non-numeric data after imputation:

KeyError: "['target_column'] not found in axis"