# About Datasets

# Context
Card companies need to be able to recognize fraudulent credit card transactions; this capability is very important with today's technology. Customers want secure transactions so that they do not fall victim to scams.


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the credit card transactions CSV into a DataFrame; the bare name on the
# last line makes the notebook render a preview of it.
datasets = pd.read_csv('creditcard.csv')
datasets

In [None]:
# Print the column dtypes and non-null counts.
datasets.info()

In [None]:
# Summary statistics for every numeric column.
datasets.describe()

In [None]:
# Distinct values of 'Class', the column later used as the target `y`.
datasets['Class'].unique()

In [None]:
# Number of rows per class, to see how imbalanced the target is.
datasets['Class'].value_counts()

In [None]:
# Frequency of each distinct transaction amount.
datasets['Amount'].value_counts()

In [None]:
# List the column names.
datasets.columns

In [None]:
# Summary statistics for the 'Time' column.
datasets['Time'].describe()

In [None]:
# Summary statistics for the 'Amount' column.
datasets['Amount'].describe()

In [None]:
# NOTE(review): repeats the 'Time' summary shown two cells earlier.
datasets['Time'].describe()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
plt.figure(figsize=(30, 20))
sns.heatmap(
    data=datasets.corr(),
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
)
plt.show()

In [None]:
# Keep the pairwise correlation matrix so the column filter below can reuse it.
corr_matrix = datasets.corr()

In [None]:
# Display the correlation matrix as a table.
corr_matrix

In [None]:
# Collect the columns whose absolute correlation with every *other* column is
# below the 0.13 threshold.
# The column's correlation with itself (always 1.0 on the diagonal) is removed
# before the comparison; with it included the test can never succeed and the
# list would always be empty.
# 'Class' is never selected because later cells use it as the target.
low_corr_cols = [
    col
    for col in corr_matrix.columns
    if col != 'Class' and (corr_matrix[col].drop(labels=col).abs() < 0.13).all()
]

In [None]:
# Remove the weakly correlated columns from the working DataFrame in place.
datasets.drop(labels=low_corr_cols, axis='columns', inplace=True)

In [None]:
# Redraw the correlation heatmap for the columns that remain after the drop.
plt.figure(figsize=(30, 20))
sns.heatmap(
    data=datasets.corr(),
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
)
plt.show()

In [None]:
# Compact, annotated correlation heatmap of the columns currently in `datasets`.
plt.figure(figsize=(15, 6))
ax = sns.heatmap(
    data=datasets.corr(),
    annot=True,
    annot_kws=dict(size=10),  # annotation text size
    fmt='.2f',
    cmap='coolwarm',
    linecolor='black',  # colour of the borders between cells
    linewidths=0.5,  # thin borders so neighbouring cells are easier to tell apart
)
ax.set_title('Correlation Heatmap of Modified DataFrame', fontsize=16)

# Enlarge the tick labels on both axes.
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

In [None]:
# Draw one box plot per column (at most nine, one per cell of the 3x3 grid)
# to inspect each feature for outliers.
fig, ax = plt.subplots(nrows=3, ncols=3, figsize=(20, 20))
ax = ax.flatten()
fig.suptitle('Box Plot to Check for Outliers', fontsize=24)

# zip() stops once the nine axes are used up, so any further columns are skipped.
for axis, column in zip(ax, datasets.columns):
    sns.boxplot(data=datasets, x=column, ax=axis)
    axis.set_title(column, fontsize=18)
    axis.set_xlabel('')

# Reserve the top 5% of the figure for the suptitle.
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()

In [None]:
# Target vector is the 'Class' column; the feature matrix is every other column.
y = datasets['Class']
X = datasets.drop(columns='Class')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Report the (rows, columns) of the training and test feature matrices, one per line.
print(X_train.shape, X_test.shape, sep='\n')

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so the test data does not influence them.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Display the standardized training matrix.
X_train_scaled

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Random forest with a fixed seed, trained on the standardized training split.
model = RandomForestClassifier(random_state=0)
model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Undersampling: first report how many training labels fall in each class
# before any resampling is done.
from collections import Counter
print(f'Original class distribution in y_train: {Counter(y_train)}')

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Balance the classes by randomly undersampling the training split.
# The standardized features are resampled (not the raw X_train) because the
# estimators fitted on this data predict on X_test_scaled; resampling the raw
# features would make training and prediction use different feature scales.
undersampler = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train_scaled, y_train)

In [None]:
# Report the per-class counts of the resampled training labels.
print(f'Class distribution after undersampling: {Counter(y_train_resampled)}')

In [None]:
# Refit the forest on the undersampled training data and predict the test split.
# `fit` returns the estimator itself, so the two calls can be chained.
# NOTE(review): confirm X_train_resampled is in the same (standardized) feature
# space as X_test_scaled.
y_pred = model.fit(X_train_resampled, y_train_resampled).predict(X_test_scaled)

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the standardized training features with all degree-2 terms
# (squares and pairwise products), without the constant bias column.
poly = PolynomialFeatures(include_bias=False, degree=2)
X_poly = poly.fit_transform(X=X_train_scaled)

In [None]:
# Derived feature: transaction amount divided by (Time + 1); the +1 keeps the
# denominator non-zero for rows where Time is 0.
datasets['Amount_per_Time'] = datasets['Amount'].div(datasets['Time'].add(1))

In [None]:
# Reduce the skew of 'Amount' by storing log(1 + Amount) as a new column.
datasets['Log_Amount'] = datasets['Amount'].pipe(np.log1p)

In [None]:
# Interaction features: element-wise products of column pairs, added as new
# columns so their importance can be assessed afterwards.
datasets['V1_V2'] = datasets['V1'].mul(datasets['V2'])
datasets['V3_V4'] = datasets['V3'].mul(datasets['V4'])

In [None]:
# With the V1*V2 interaction in place, remove the two source columns in place.
# NOTE(review): `X` was built before these feature-engineering cells, so in the
# visible cells none of the new or dropped columns reach the model - confirm.
datasets.drop(labels=['V1', 'V2'], axis='columns', inplace=True)

In [None]:
# Keep only the features whose importance in the fitted forest exceeds 0.01.
importances = model.feature_importances_
important_features = [
    name
    for name, score in zip(X.columns, importances)
    if score > 0.01
]
X_important = X.loc[:, important_features]

In [None]:
from sklearn.model_selection import GridSearchCV

# Search over forest size and tree depth with 3-fold cross-validation on the
# undersampled training data, then keep the best refitted estimator.
param_grid = dict(n_estimators=[100, 200], max_depth=[10, 20])
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=0),
    param_grid,
    cv=3,
)
grid_search.fit(X_train_resampled, y_train_resampled)
best_model = grid_search.best_estimator_

In [None]:
# Predict the test split with the tuned estimator chosen by the grid search.
# Calling `model.predict` here would only repeat the earlier `y_pred`
# computation and leave `best_model` unused.
prediction = best_model.predict(X_test_scaled)