In [1]:
pip install --upgrade scikit-learn imbalanced-learn


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV

# Step 1: Data Collection
data = pd.read_csv('C:/Users/shriy/Downloads/creditcard.csv')

# Step 2: Exploratory Data Analysis (EDA)
print(data.head())
print(data.info())
print(data.describe())

# Step 3: Data Cleaning
# Handle missing values
data.dropna(inplace=True)

# Step 4: Dealing with Imbalanced Data
# Separate features and target variable
X = data.drop('Time', axis=1)
y = data['Time']


# Specify the number of neighbors based on the size of the minority class
n_neighbors = min(5, sum(y == 1))  # Adjust 5 to a suitable value

# Apply SMOTE for balancing the data
smote = SMOTE(random_state=42, k_neighbors=n_neighbors)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Apply SMOTE for balancing the data
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Step 5: Feature Engineering
# Perform feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_resampled)

# Step 6: Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_resampled, test_size=0.2, random_state=42)

# Step 7: Choose Evaluation Metrics
# For imbalanced data, consider using metrics like precision, recall, F1-score, and AUC-ROC curve

# Step 8: Model Selection, Training, Predicting, and Assessment
# Initialize Random Forest Classifier
rf_clf = RandomForestClassifier(random_state=42)

# Train the model
rf_clf.fit(X_train, y_train)

# Make predictions
y_pred = rf_clf.predict(X_test)

# Assess model performance
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

# Step 9: Hyperparameter Tuning/Model Improvement
# Perform GridSearchCV for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=rf_clf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

#Step 10
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

# Perform cross-validation
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')

# Print cross-validation scores
print("Cross-validation Scores:", cv_scores)
print("Mean CV Accuracy:", np.mean(cv_scores))

# Fit the model on the entire training data
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred))

#Step 11
import pickle

# Save the trained model
with open('credit_card_fraud_detection_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Define API endpoints for model deployment
# Example using Flask:
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route('/predict', methods=['POST'])
def predict():
    data = request.get_json()
    # Preprocess input data (if necessary)
    # Make predictions using the trained model
    prediction = model.predict(data)
    return jsonify({'prediction': prediction})

if __name__ == '__main__':
    app.run(debug=True)




   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

ValueError: Expected n_neighbors <= n_samples_fit, but n_neighbors = 3, n_samples_fit = 2, n_samples = 2

In [None]:
print(data.columns)


In [None]:
#Visualization 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
data = pd.read_csv('C:/Users/shriy/Downloads/creditcard.csv')

# Exploratory Data Analysis (EDA)

# Data Quality Check

# Show the distribution of the target variable (fraudulent vs. non-fraudulent transactions)
plt.figure(figsize=(8, 6))
sns.countplot(x='Time', data=data)
plt.title('Distribution of Target Variable')
plt.xlabel('Time')
plt.ylabel('Count')
plt.show()

# Check for missing values in the dataset
plt.figure(figsize=(10, 6))
sns.heatmap(data.isnull(), cbar=False, cmap='viridis')
plt.title('Missing Values in the Dataset')
plt.show()

# Summary Statistics

# Display summary statistics for numerical features
plt.figure(figsize=(10, 6))
sns.boxplot(data=data.drop('Time', axis=1))
plt.title('Summary Statistics for Numerical Features')
plt.ylabel('Values')
plt.show()

# Visualize the distribution of categorical features
plt.figure(figsize=(10, 6))
sns.countplot(x='category', data=data)
plt.title('Distribution of Categorical Feature (Category)')
plt.xlabel('Category')
plt.ylabel('Count')
plt.show()

# Identification of Patterns and Trends

# Use scatter plots to explore relationships between numerical features
plt.figure(figsize=(10, 6))
sns.pairplot(data=data, hue='Time')
plt.title('Relationships Between Numerical Features')
plt.show()

# Visualize the correlation matrix heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(data.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Matrix Heatmap')
plt.show()

# Plot time series plots to explore temporal patterns in the data
plt.figure(figsize=(10, 6))
sns.lineplot(x='date', y='amount', data=data, hue='Time')
plt.title('Temporal Patterns in Transaction Amount')
plt.xlabel('Date')
plt.ylabel('Amount')
plt.show()


In [None]:
# Show the distribution of the target variable (fraudulent vs. non-fraudulent transactions)
plt.figure(figsize=(8, 6))
sns.countplot(x='Time', data=data)
plt.title('Distribution of Target Variable')
plt.xlabel('Time')
plt.ylabel('Count')
plt.show()
