<a href="https://colab.research.google.com/github/SanthoshPollai/CODSOFT/blob/main/CodeSoft_Task_2_CreditCardFraudDetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Name : Pollai Santhosh**

# **Code Soft Task 2 :**

**CREDIT CARD FRAUD DETECTION**

Build a model to detect fraudulent credit card transactions. Use a
dataset containing information about credit card transactions, and
experiment with algorithms like Logistic Regression, Decision Trees, or Random Forests to classify transactions as fraudulent or legitimate.

In [19]:
# Import the required libraries and setup
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

In [20]:
# Load the dataset
df = pd.read_csv('/content/fraudTest.csv')

In [21]:
# Sample a subset of the data to reduce memory usage
df = df.sample(frac=0.1, random_state=42)  # Adjust frac to get the desired subset size

In [22]:
# DATA PREPROCESS
# Convert categorical variables to numeric using one-hot encoding
categorical_columns = ['category', 'job', 'gender']
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

In [23]:
# Ensure all columns are numeric
non_numeric_columns = df.select_dtypes(include=['object']).columns
for col in non_numeric_columns:
    df[col] = df[col].astype('category').cat.codes

In [24]:
# Check for missing values and drop if any
df = df.dropna()

In [25]:
# Drop irrelevant or redundant columns
columns_to_drop = ['trans_date_trans_time', 'cc_num', 'first', 'last', 'street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'dob', 'trans_num', 'unix_time', 'merch_lat', 'merch_long']
columns_to_drop = [col for col in columns_to_drop if col in df.columns]
df = df.drop(columns=columns_to_drop)

In [26]:
# FEATURE ENGINEERING
# Separate features and target variable
X = df.drop(columns=['is_fraud'])
y = df['is_fraud']

In [27]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
# MODEL SELECTION & TRAINING
# Logistic Regression
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)
lr_predictions = lr_model.predict(X_test)

In [29]:
# Decision Tree
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)
dt_predictions = dt_model.predict(X_test)

In [30]:
# Decision Tree
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)
dt_predictions = dt_model.predict(X_test)

In [31]:
# MODEL EVALUATION
# Logistic Regression Evaluation
def evaluate_logistic_regression():
    print("Logistic Regression:")
    print(classification_report(y_test, lr_predictions))
    print("Confusion Matrix:\n")
    print(confusion_matrix(y_test, lr_predictions))
    print(f"Accuracy: {accuracy_score(y_test, lr_predictions)}")
    print(f"Precision: {precision_score(y_test, lr_predictions)}")
    print(f"Recall: {recall_score(y_test, lr_predictions)}")
    print(f"F1 Score: {f1_score(y_test, lr_predictions)}")

In [32]:
evaluate_logistic_regression()

Logistic Regression:
              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00      1161
         1.0       0.00      0.00      0.00         6

    accuracy                           0.99      1167
   macro avg       0.50      0.50      0.50      1167
weighted avg       0.99      0.99      0.99      1167

Confusion Matrix:

[[1160    1]
 [   6    0]]
Accuracy: 0.9940017137960583
Precision: 0.0
Recall: 0.0
F1 Score: 0.0


In [33]:
# Decision Tree Evaluation
def evaluate_decision_tree():
    dt_predictions = dt_model.predict(X_test)
    print("Decision Tree:\n")
    print(classification_report(y_test, dt_predictions))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, dt_predictions))
    print(f"Accuracy: {accuracy_score(y_test, dt_predictions)}")
    print(f"Precision: {precision_score(y_test, dt_predictions)}")
    print(f"Recall: {recall_score(y_test, dt_predictions)}")
    print(f"F1 Score: {f1_score(y_test, dt_predictions)}")

evaluate_decision_tree()

Decision Tree:

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1161
         1.0       0.40      0.33      0.36         6

    accuracy                           0.99      1167
   macro avg       0.70      0.67      0.68      1167
weighted avg       0.99      0.99      0.99      1167

Confusion Matrix:
[[1158    3]
 [   4    2]]
Accuracy: 0.9940017137960583
Precision: 0.4
Recall: 0.3333333333333333
F1 Score: 0.36363636363636365


In [34]:
# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [35]:
# Random Forest Evaluation
def evaluate_random_forest():
    rf_predictions = rf_model.predict(X_test)
    print("Random Forest:\n")
    print(classification_report(y_test, rf_predictions))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, rf_predictions))
    print(f"Accuracy: {accuracy_score(y_test, rf_predictions)}")
    print(f"Precision: {precision_score(y_test, rf_predictions)}")
    print(f"Recall: {recall_score(y_test, rf_predictions)}")
    print(f"F1 Score: {f1_score(y_test, rf_predictions)}")

evaluate_random_forest()

Random Forest:

              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00      1161
         1.0       0.00      0.00      0.00         6

    accuracy                           0.99      1167
   macro avg       0.50      0.50      0.50      1167
weighted avg       0.99      0.99      0.99      1167

Confusion Matrix:
[[1161    0]
 [   6    0]]
Accuracy: 0.9948586118251928
Precision: 0.0
Recall: 0.0
F1 Score: 0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [36]:
def classify_transactions(model, new_data):
    new_data = pd.get_dummies(new_data, columns=categorical_columns, drop_first=True)
    new_data = new_data.reindex(columns=X.columns, fill_value=0)
    predictions = model.predict(new_data)
    new_data['Prediction'] = predictions
    new_data['Prediction'] = new_data['Prediction'].map({0: 'Legitimate', 1: 'Fraudulent'})

    return new_data

In [37]:
# Example usage with new data
new_transactions = pd.DataFrame({
    'category': ['food_dining', 'gas_transport', 'kids_pets', 'health_fitness', 'home'],
    'job': ['Doctor', 'Engineer', 'Police', 'Scientist', 'Buisness Analyst'],
    'gender': ['M', 'F', 'M', 'F', 'M']
})
classified_data = classify_transactions(lr_model, new_transactions)
print(classified_data)

   Unnamed: 0  merchant  amt  category_food_dining  category_gas_transport  \
0           0         0    0                     0                   False   
1           0         0    0                     0                    True   
2           0         0    0                     0                   False   
3           0         0    0                     0                   False   
4           0         0    0                     0                   False   

   category_grocery_net  category_grocery_pos  category_health_fitness  \
0                     0                     0                    False   
1                     0                     0                    False   
2                     0                     0                    False   
3                     0                     0                     True   
4                     0                     0                    False   

   category_home  category_kids_pets  ...  job_Volunteer coordinator  \
0          Fal