<a href="https://colab.research.google.com/github/SaddamRafiq/Model-Engineering/blob/main/UseCase.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score






# Load the raw transaction data from the Excel workbook.
file_path = 'PSP_Jan_Feb_2019.xlsx'  # Replace with your file path
data = pd.read_excel(file_path)

# Display the first few rows
print(data.head())

# Display data info.
# DataFrame.info() writes its report directly to stdout and returns None,
# so wrapping it in print() would append a stray "None" line to the output.
data.info()

# Summary statistics
print(data.describe())


# Plot distribution of transaction amounts (histogram with a KDE overlay).
plt.figure(figsize=(10, 6))
sns.histplot(data['amount'], kde=True)
plt.title('Distribution of Transaction Amounts')
plt.xlabel('Transaction Amount')
plt.ylabel('Frequency')
plt.show()

# Plot success rate by PSP.
# The estimator receives the 'success' values of one PSP at a time. Their
# mean is the fraction of successful transactions, so mean * 100 is the
# success rate in percent. (Using len(x) / len(data) here would instead plot
# each PSP's share of all transactions, which is not what the chart claims.)
# NOTE(review): assumes 'success' is coded 0/1 (or bool) - confirm in the data.
plt.figure(figsize=(12, 6))
sns.barplot(x='PSP', y='success', data=data, estimator=lambda values: values.mean() * 100)
plt.title('Success Rate by PSP')
plt.xlabel('Payment Service Provider')
plt.ylabel('Success Rate (%)')
plt.xticks(rotation=45)
plt.show()



# Correlation overview of the numeric columns of `data`.

# Keep only float/int columns; text and datetime columns cannot be correlated.
numeric_data = data.select_dtypes(include=[float, int])

# Pairwise correlation between the remaining columns.
correlation_matrix = numeric_data.corr()

# Render the matrix as an annotated heatmap on a dedicated figure.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    square=True,
    ax=ax,
)
ax.set_title('Correlation Matrix')
plt.show()

# Cleaning: remove the 'Unnamed: 0' column (presumably a leftover row index
# from the export - verify) and discard exact duplicate rows.
data = data.drop(columns=['Unnamed: 0']).drop_duplicates()
print(f"Data shape after cleaning: {data.shape}")

# Feature engineering: keep only the hour of day of each transaction and
# drop the original timestamp column in the same step.
data = data.assign(
    transaction_hour=pd.to_datetime(data['tmsp']).dt.hour
).drop(columns=['tmsp'])
print(data.head())



# One-hot encode the categorical variables; drop_first avoids a redundant
# (perfectly collinear) dummy column per category.
data_encoded = pd.get_dummies(data, columns=['country', 'PSP', 'card'], drop_first=True)

# Display the first few rows after encoding
print(data_encoded.head())

# Define the target variable and features
X = data_encoded.drop('success', axis=1)
y = data_encoded['success']

# Split the data into training and testing sets (stratified on the target so
# both splits keep the same success/failure ratio).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Work on explicit copies so the column assignments below modify the splits
# themselves and do not trigger pandas' chained-assignment warnings.
X_train = X_train.copy()
X_test = X_test.copy()

# Feature scaling for the 'amount' column.
# The scaler is fitted on the training split ONLY and then applied to the
# test split. Fitting it on the full dataset before splitting would leak the
# test set's mean/variance into training and bias the evaluation.
scaler = StandardScaler()
X_train['amount'] = scaler.fit_transform(X_train[['amount']]).ravel()
X_test['amount'] = scaler.transform(X_test[['amount']]).ravel()

# Check the shapes of the splits
print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}, y_test shape: {y_test.shape}")



# Baseline: logistic regression fitted on the training split.
# fit() returns the estimator itself, so construction and training chain.
baseline_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# Predict the held-out test split.
y_pred_baseline = baseline_model.predict(X_test)

# Report per-class precision / recall / F1 for the baseline.
print(
    "Baseline Model Performance:",
    classification_report(y_test, y_pred_baseline),
    sep="\n",
)



# Untuned random forest (100 trees, fixed seed) fitted on the training split.
# fit() returns the estimator itself, so construction and training chain.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out test split.
y_pred_rf = rf_model.predict(X_test)

# Report per-class precision / recall / F1 for the random forest.
print(
    "Random Forest Model Performance:",
    classification_report(y_test, y_pred_rf),
    sep="\n",
)



# Hyper-parameter grid explored by the search (2^5 = 32 combinations).
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2']  # Replaced 'auto' with 'sqrt'
}

# Fresh, unfitted forest used as the template estimator for the search.
# The seed is fixed like everywhere else in this script; without it every
# run could select different "best" parameters and report different scores.
rf_model = RandomForestClassifier(random_state=42)

# Set up GridSearchCV: 3-fold cross-validation on the training split,
# candidates ranked by F1, all CPU cores used.
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='f1')
grid_search.fit(X_train, y_train)

# Report the winning configuration and take the model refitted with it.
print(f"Best parameters: {grid_search.best_params_}")
best_rf_model = grid_search.best_estimator_
y_pred_tuned_rf = best_rf_model.predict(X_test)

# Evaluate the tuned Random Forest model
print("Tuned Random Forest Model Performance:")
print(classification_report(y_test, y_pred_tuned_rf))


# Confusion matrix of the tuned random forest on the test split.
confusion = confusion_matrix(y_test, y_pred_tuned_rf)
print(f"Confusion Matrix:\n{confusion}")

# Headline metrics, all computed from the same test predictions.
accuracy, precision, recall, f1 = (
    accuracy_score(y_test, y_pred_tuned_rf),
    precision_score(y_test, y_pred_tuned_rf),
    recall_score(y_test, y_pred_tuned_rf),
    f1_score(y_test, y_pred_tuned_rf),
)

# One metric per line, rounded to four decimals.
print(
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1-Score: {f1:.4f}",
    sep="\n",
)

# Feature importances of the tuned forest, paired with the training columns.
feature_names = X_train.columns
importances = best_rf_model.feature_importances_

# Tabulate name/importance pairs, then rank from most to least important.
feature_importances_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_importances_df = feature_importances_df.sort_values(by='Importance', ascending=False)

print("Top 10 Feature Importances:", feature_importances_df.head(10), sep="\n")

# Horizontal bar chart of every feature's importance.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_importances_df['Feature'], feature_importances_df['Importance'])
ax.set_xlabel('Importance')
ax.set_title('Feature Importances in Tuned Random Forest')
plt.show()