In [2]:
import pandas as pd
import numpy as np

# Load dataset
# NOTE(review): absolute path (looks like a hosted-notebook working directory);
# confirm it exists in the environment where this notebook is re-run.
df = pd.read_csv("/content/Loan_data.csv")

# 1. Viewing and Inspecting DataFrames
print(" Shape of dataset:", df.shape)

print("\n Data Types and Null Values:")
# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called on its own; wrapping it in print() emits a stray "None" line.
df.info()

print("\n First 5 Rows:")
print(df.head())

# Per-column count of missing (null) entries
print("\n Missing values in each column:")
print(df.isnull().sum())

# 2. Filtering and Subsetting Data
# Each subset is selected with .loc and a callable mask evaluated against df;
# the number of matching rows is then reported.

# Rows where ApplicantIncome is strictly greater than 5000
high_income = df.loc[lambda frame: frame['ApplicantIncome'].gt(5000)]
print(f"\n Number of high income applicants (>5000): {len(high_income)}")

# Rows where Self_Employed is 'Yes' and Loan_Status is 'Y'
approved_self_employed = df.loc[
    lambda frame: frame['Self_Employed'].eq('Yes') & frame['Loan_Status'].eq('Y')
]
print(f" Approved self-employed loans: {len(approved_self_employed)}")

# Rows where Property_Area is 'Urban' and CoapplicantIncome is positive
# (a positive coapplicant income is used as the indicator of a coapplicant)
urban_with_coapp = df.loc[
    lambda frame: frame['Property_Area'].eq('Urban') & frame['CoapplicantIncome'].gt(0)
]
print(f" Urban applicants with coapplicants: {len(urban_with_coapp)}")

# 3. Descriptive Statistics

# LoanAmount: work on the non-missing values only
loan_amt = df['LoanAmount'].dropna()

# Measures of spread
std_loan = loan_amt.std()
variance_loan = loan_amt.var()
range_loan = loan_amt.max() - loan_amt.min()

# Measures of central tendency; mode() returns a Series, its first entry is used
mode_loan = loan_amt.mode().iloc[0]
median_loan = loan_amt.median()
mean_loan = loan_amt.mean()

# One print call, one statistic per line
print(
    "\n LoanAmount Statistics:",
    f"Mean: {mean_loan:.2f}",
    f"Median: {median_loan}",
    f"Mode: {mode_loan}",
    f"Range: {range_loan}",
    f"Variance: {variance_loan:.2f}",
    f"Standard Deviation: {std_loan:.2f}",
    sep="\n",
)

# ApplicantIncome: same set of statistics, computed on the column as-is
income = df['ApplicantIncome']

# Measures of spread
std_income = income.std()
variance_income = income.var()
range_income = income.max() - income.min()

# Measures of central tendency; mode() returns a Series, its first entry is used
mode_income = income.mode().iloc[0]
median_income = income.median()
mean_income = income.mean()

# One print call, one statistic per line
print(
    "\n ApplicantIncome Statistics:",
    f"Mean: {mean_income:.2f}",
    f"Median: {median_income}",
    f"Mode: {mode_income}",
    f"Range: {range_income}",
    f"Variance: {variance_income:.2f}",
    f"Standard Deviation: {std_income:.2f}",
    sep="\n",
)

# 4. Summary Table
# Heading and the describe() table are emitted by one print call, one per line.
print("\n Summary Statistics for All Numeric Columns:", df.describe(), sep="\n")


# 5. Group Analysis: Mean LoanAmount by Education
# Group the LoanAmount column by the Education column and average each group.
loan_by_education = df['LoanAmount'].groupby(df['Education']).mean()
print("\n Mean LoanAmount by Education:", loan_by_education, sep="\n")


 Shape of dataset: (614, 13)

 Data Types and Null Values:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB
None

 First 5 Rows:
    Loan_ID Gender Married Depen