In [11]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily walk the Kaggle input directory and build the full path of every file found.
input_files = (
    os.path.join(folder, entry)
    for folder, _subdirs, entries in os.walk('/kaggle/input')
    for entry in entries
)
# One path per line, in os.walk traversal order.
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/vehicle/Car details v3.csv


In [None]:
"""
# T15 Practise Solution

def read_data(spark, customSchema):
    '''
    Read the loan CSV from the S3 input location through spark.read.csv.

    spark        : Spark session whose reader is used
    customSchema : schema passed as schema= to the reader (header=True is also set)
    returns      : the object returned by spark.read.csv for the input path
    '''
    print("--------------------")
    print("Starting read_data")
    print("--------------------")

    #Mention the Bucket name inside the bucket name variable
    bucket_name = "loan-data926529466287266"
    # Fixed: this name was written "s3_input_ path" (with a space), which is a
    # SyntaxError and left s3_input_path undefined for the read below.
    s3_input_path = "s3://" + bucket_name + "/inputfile/loan_data.csv"
    df = spark.read.csv(s3_input_path, header=True, schema=customSchema)

    return df


def clean_data(input_df):
    '''
    Clean the raw loan data.

    input_df : output of the read_data function

    Applies dropna() and dropDuplicates(), then keeps only the rows whose
    purpose column differs from the literal string 'null'.
    '''
    for banner_line in ("--------------------", "Starting clean_data", "--------------------"):
        print(banner_line)

    without_missing = input_df.dropna()
    deduplicated = without_missing.dropDuplicates()
    # 'null' here is a string value in the data, not a missing value
    has_real_purpose = deduplicated.purpose != 'null'

    return deduplicated.filter(has_real_purpose)


def s3_load_data(data, file_name):
    '''
    Write a DataFrame to the bucket's output/ prefix as CSV with a header row.

    data      : the output data of the result_1 or result_2 function
    file_name : name of the output to be stored inside S3; a leading "/" is optional

    Nothing is written when data.count() is 0; a message is printed instead.
    '''
    #Mention the bucket name inside the bucket_name variable
    bucket_name = "loan-data926529466287266"
    # Fixed: "/output" + file_name produced ".../outputresult_1" whenever file_name
    # had no leading slash. Stripping and re-adding the separator keeps callers that
    # already pass "/result_1" on exactly the same path as before.
    output_path = "s3://" + bucket_name + "/output/" + file_name.lstrip("/")

    if data.count() != 0:
        print("Loading the data", output_path)
        # coalesce(1) reduces the data to a single partition before the write
        data.coalesce(1).write.csv(output_path, header=True, mode="overwrite")
    else:
        print("Empty dataframe, hence cannot save the data", output_path)


def result_1(input_df):
    '''
    Keep educational and small_business loans and add three derived columns.

    input_df : output of the clean_data function

    Added columns:
      income_to_installment_ratio : log_annual_inc divided by installment
      int_rate_category           : "low" below 0.1, "medium" from 0.1 up to (not including) 0.15, else "high"
      high_risk_borrower          : 1 when dti > 20 or fico < 700 or revol_util > 80, else 0
    '''
    print("--------------------------")
    print("Starting result_1")
    print("--------------------------")

    # NOTE(review): col and when are not defined in this cell; presumably they come
    # from pyspark.sql.functions -- confirm the import exists where this runs.
    wanted_purpose = (col("purpose") == "educational") | (col("purpose") == "small_business")

    income_ratio = col("log_annual_inc") / col("installment")

    rate_bucket = (
        when(col("int_rate") < 0.1, "low")
        .when((col("int_rate") >= 0.1) & (col("int_rate") < 0.15), "medium")
        .otherwise("high")
    )

    risky = (col("dti") > 20) | (col("fico") < 700) | (col("revol_util") > 80)
    risk_flag = when(risky, 1).otherwise(0)

    return (
        input_df
        .filter(wanted_purpose)
        .withColumn("income_to_installment_ratio", income_ratio)
        .withColumn("int_rate_category", rate_bucket)
        .withColumn("high_risk_borrower", risk_flag)
    )


def result_2(input_df):
    '''
    Compute the default rate per loan purpose.

    input_df : output of the clean_data function

    Returns one row per purpose with default_rate = sum(not_fully_paid) / count(*),
    rounded to 2 decimal places.
    '''
    print("--------------------------")
    print ("Starting result_2")
    print("--------------------------")

    # NOTE(review): sum, count, round and col are not defined in this cell; presumably
    # they are the pyspark.sql.functions versions rather than the builtins -- confirm.
    unpaid_total = sum(col("not_fully_paid"))
    loan_total = count("*")
    raw_rate = (unpaid_total / loan_total).alias("default_rate")

    rate_by_purpose = input_df.groupBy("purpose").agg(raw_rate)

    return rate_by_purpose.withColumn("default_rate", round(col("default_rate"), 2))


def redshift_load_data(data):
    '''
    Overwrite a Redshift table with the given DataFrame through the JDBC writer.

    data : DataFrame to load; nothing is written when data.count() is 0

    Connection settings are read from the environment so that secrets do not have
    to live in the notebook:
      REDSHIFT_JDBC_URL, REDSHIFT_USER, REDSHIFT_PASSWORD, REDSHIFT_TABLE
    Each one falls back to the value that used to be hard-coded here, so existing
    runs behave as before.
    '''
    import os  # function-scope import keeps this definition self-contained

    if data.count() == 0:
        print("Empty dataframe, hence cannot load the data")
        return

    print("Loading the data into Redshift...")
    # NOTE(review): the fallbacks below still expose a real-looking password in
    # source. Rotate it and delete the defaults once the variables are configured.
    jdbcUrl = os.environ.get(
        "REDSHIFT_JDBC_URL",
        "jdbc:redshift://emr-spark-redshift.cjgnpeot7x5i.us-east-1.redshift.amazonaws.com:5439/dev",
    )
    username = os.environ.get("REDSHIFT_USER", "awsuser") #Mention redshift username
    password = os.environ.get("REDSHIFT_PASSWORD", "Awsuser1") #Mention redshift password
    table_name = os.environ.get("REDSHIFT_TABLE", "result_2") #Mention redshift table name

    #Write the redshift load data command
    # Parenthesised chain instead of backslash line continuations.
    (
        data.write
        .format("jdbc")
        .option("url", jdbcUrl)
        .option("dbtable", table_name)
        .option("user", username)
        .option("password", password)
        .mode("overwrite")
        .save()
    )
"""

In [None]:
"""
#### Import statements here
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import resample
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report 

import warnings
import boto3
from sagemaker import get_execution_role

warnings.filterwarnings('ignore')
####



#### Import the dataset from S3
bucket="loan-data602607864436400"
folder_name = "loan_cleaned_data"
data_key = "loan_cleaned_data.csv"
data_location = f's3://{bucket}/{folder_name}/{data_key}'


data = pd.read_csv(data_location)
data.head()


# One-hot encode the 'purpose' column into 0/1 integer columns
data = pd.get_dummies(data,columns=['purpose'], dtype=int)
data.head()


# Separate the rows by target class: 0 = majority frame, 1 = minority frame
df_majority=data[data['not_fully_paid']==0]
df_minority=data[data['not_fully_paid']==1]


# Handle the imbalanced data using resample method and oversample the minority class
# (sampling with replacement until it has as many rows as the majority frame)
df_minority_upsampled = resample(df_minority,
                                 replace=True,
                                 n_samples = df_majority.shape[0],
                                 random_state=42)


# Concatenate the upsampled data records with the majority class records and shuffle the resultant dataframe
# Fixed: the shuffle described above was never performed (shuffle was imported but
# unused), leaving all majority rows stacked ahead of the minority rows.
df_balanced = shuffle(pd.concat([df_majority, df_minority_upsampled]), random_state=42)
print(df_balanced['not_fully_paid'].value_counts())


# NOTE(review): oversampling happens before train_test_split, so copies of the same
# minority row can land in both the training and the test split, which makes the
# classification report below optimistic. Consider splitting first and resampling
# only the training part.

# Create X and y data for train-test split
# (train_test_split, RandomForestClassifier and classification_report are already
# imported at the top of this cell; the repeated mid-script imports were removed)
X = df_balanced.drop(columns=['sl_no','not_fully_paid'])
y = df_balanced['not_fully_paid']


# Split the data: 60% train / 40% test
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.4, random_state=42)


# Train a Random Forest Classifier model
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)


# Predict using the trained Random Forest Classifier model
y_pred = rf.predict(X_test)


# Print the classification report
print(classification_report(y_test, y_pred))


# Serialise the fitted model to a temporary file and upload it to S3 as model.pkl
import tempfile

import joblib
BUCKET_NAME = bucket
with tempfile.NamedTemporaryFile() as tmp:
    joblib.dump(rf, tmp.name)
    tmp.flush()

    s3= boto3.client('s3')

    # The upload must happen inside the with-block: the temporary file is deleted on exit
    s3.upload_file(tmp.name, BUCKET_NAME , "model.pkl")

"""

In [None]:
"""
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn. compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import boto3
import pandas as pd
from sagemaker import get_execution_role
import numpy as np
import warnings


### Create S3 path for the dataset
# bucket = 
# data_key = 
# data_location = f's3://{bucket}/{data_key}'
# Fixed: data_location was only ever assigned in the commented-out template above, so
# the read below raised NameError. Default to the CSV that this notebook's input
# listing reports; switch to the S3 form above when running against a bucket.
data_location = '/kaggle/input/vehicle/Car details v3.csv'


### Load the dataset
data = pd.read_csv(data_location)


### Analyze the dataset
data.head()


### Create new feature: age of the car
REFERENCE_YEAR = 2024  # year that car_age is measured from
data['car_age'] = REFERENCE_YEAR - data['year']


### Drop the columns
del data['name'], data['year']


### Define the features and target variable
X = data.drop(columns=['selling_price'])
y = data['selling_price']


# numerical_features = ['km_driven', 'mileage', 'engine', 'max_power', 'seats', 'car_age']
# Only the columns listed in the two lists below reach the model: ColumnTransformer
# drops every other column by default (remainder='drop').
numerical_features = ['km_driven', 'seats', 'car_age']
categorical_features = ['fuel', 'seller_type', 'transmission', 'owner']

### Log transformation for skewed numerical features
# log1p is applied to the target as well; predictions are mapped back with expm1 below
X['km_driven'] = np.log1p(X['km_driven'])
y = np.log1p(y)


# NOTE(review): no missing-value handling is applied before fitting -- confirm the
# selected columns contain no NaNs, or add an imputer step.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])


model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=8))
])

### Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8)


# 3 * 4 * 3 = 36 parameter combinations, each fitted on 5 folds
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 5, 10]
}

### Create the model
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='r2', n_jobs=1)


grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_


### Make predictions
y_pred = best_model.predict(X_test)

### Transform predictions back to original scale
# Both arrays are in log1p space at this point; the metrics below use original units
y_test = np.expm1(y_test)
y_pred = np.expm1(y_pred)


### Calculate evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

### Print the metrics
print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'RMSE: {rmse} ')
print(f'R2: {r2} ')
print(f'Best Parameters: {grid_search.best_params_}')

"""