In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import pandas as pd

In [2]:
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [3]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
import seaborn as sns

In [4]:
# Load in the big dataset

dataframe = pd.read_csv("data.csv")

In [5]:
fraud_df = dataframe.sample(10000, random_state=42)

In [6]:
fraud_df.head()

Unnamed: 0,amount_scaled,oldbalanceOrg_scaled,newbalanceOrig_scaled,oldbalanceDest_scaled,newbalanceDest_scaled,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,isFraud
3737323,0.335721,-0.286467,-0.18452,-0.214971,-0.350011,1.0,0.0,0.0,0.0,0.0,0
264914,-0.347639,-0.283388,-0.290957,-0.374245,-0.390288,0.0,0.0,0.0,1.0,0.0,0
85647,-0.046004,-0.258694,-0.214056,-0.303376,-0.374125,1.0,0.0,0.0,0.0,0.0,0
5899326,2.956022,-0.293227,-0.296953,0.751735,1.175336,0.0,0.0,0.0,0.0,1.0,0
2544263,-0.204937,0.653242,0.663627,-0.227855,-0.279,1.0,0.0,0.0,0.0,0.0,0


In [7]:
#Create the dataset -> We are working with clustering so we will create a training set after scaling
#We will take 70% of the data for training so 627 rows and the rest for trainining 269

X = fraud_df
X.shape

(10000, 11)

In [12]:
#Scale our X accordingly using MinMaxScaler
X_to_scale = ['oldbalanceOrg', 'newbalanceOrig', 'amount']

#Initialize the MinMax Scaler
scaler = MinMaxScaler()

#Fit the X data for our numerical columns
scaler.fit(X[X_to_scale])

#Transform the X data for both the X_train and X_test
X_scaled = scaler.transform(X[X_to_scale])

#Convert these back to dataframes so we can add them back in
X_scaled = pd.DataFrame(X_scaled, index=X.index, columns = X_to_scale)

KeyError: "None of [Index(['oldbalanceOrg', 'newbalanceOrig', 'amount'], dtype='object')] are in the [columns]"

In [13]:
#Add back in the columns
#All the other columns
X_other_columns = X.columns.drop(X_to_scale)

#Join from scaled to not-scaled
X = X_scaled.join(X[X_other_columns])

#Make sure we have no crazy join issues, we should have shape [1000, 11] 
print(X.shape)

KeyError: "['oldbalanceOrg', 'newbalanceOrig', 'amount'] not found in axis"

In [14]:
column_names = X.columns


In [15]:
X_train = X.iloc[:627]
X_test = X.iloc[627:]

In [None]:
# Separating features and target
X = fraud_df.drop('newbalanceOrig', axis=1)
y = fraud_df['isFraud']

# Standardizing the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [None]:
# Initialize the models
linear_model = LinearRegression()
ridge_model = Ridge(alpha=1.0)
random_forest_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Function to train and evaluate a model
def train_evaluate_model(model, X_train, y_train, X_test, y_test):
    # Training the model
    model.fit(X_train, y_train)

    # Predicting the Test set results
    y_pred = model.predict(X_test)

    # Evaluating the model
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    return mse, rmse, r2

# Training and evaluating Linear Regression
linear_mse, linear_rmse, linear_r2 = train_evaluate_model(linear_model, X_train, y_train, X_test, y_test)

# Training and evaluating Ridge Regression
ridge_mse, ridge_rmse, ridge_r2 = train_evaluate_model(ridge_model, X_train, y_train, X_test, y_test)

# Training and evaluating Random Forest Regression
rf_mse, rf_rmse, rf_r2 = train_evaluate_model(random_forest_model, X_train, y_train, X_test, y_test)

# Print the results
print("Linear Regression: MSE =", linear_mse, ", RMSE =", linear_rmse, ", R^2 =", linear_r2)
print("Ridge Regression: MSE =", ridge_mse, ", RMSE =", ridge_rmse, ", R^2 =", ridge_r2)
print("Random Forest Regression: MSE =", rf_mse, ", RMSE =", rf_rmse, ", R^2 =", rf_r2)

In [None]:
coefficients = linear_model.coef_
feature_names = X.columns
coeff_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})
coeff_df = coeff_df.reindex(coeff_df.Coefficient.abs().sort_values(ascending=False).index)

In [None]:
coeff_df