In [2]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import average_precision_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [3]:
# Load the training transactions from a path relative to the notebook's
# working directory.
TRAIN_CSV = "./data/fraudTrain.csv"
df = pd.read_csv(TRAIN_CSV)

In [4]:
# Parse the two date columns so the .dt accessor can be used below.
# NOTE: this cell drops both columns at the end, so re-running it without
# re-loading df first raises a KeyError on 'dob'.
df['dob'] = pd.to_datetime(df['dob'])
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])

# Age in completed years at the time of the transaction.
# Dividing the day difference by 365 ignores leap days and overstates the age
# by one year for transactions that fall shortly before a birthday, so compare
# calendar (month, day) instead.
trans_time = df['trans_date_trans_time']
birth_date = df['dob']
birthday_pending = (
    (trans_time.dt.month * 100 + trans_time.dt.day)
    < (birth_date.dt.month * 100 + birth_date.dt.day)
)
df['age'] = trans_time.dt.year - birth_date.dt.year - birthday_pending.astype(int)

# Transaction timestamp: extract hour of day and day of week (Monday=0).
df['hour'] = trans_time.dt.hour
df['day_of_week'] = trans_time.dt.dayofweek

# Drop the raw date columns (now replaced by derived features) together with
# name, street and transaction-id columns, which are not used as features.
df = df.drop(columns=[
    'trans_date_trans_time', 'dob', 'first', 'last', 'street', 'trans_num'
])


In [5]:
# Preview the first ten rows of df.
df.iloc[:10]

Unnamed: 0,index,cc_num,merchant,category,amt,gender,city,state,zip,lat,long,city_pop,job,unix_time,merch_lat,merch_long,is_fraud,age,hour,day_of_week
0,0,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,Moravian Falls,NC,28654,36.0788,-81.1781,3495,"Psychologist, counselling",1325376018,36.011293,-82.048315,0,30,0,1
1,1,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,F,Orient,WA,99160,48.8878,-118.2105,149,Special educational needs teacher,1325376044,49.159047,-118.186462,0,40,0,1
2,2,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,M,Malad City,ID,83252,42.1808,-112.262,4154,Nature conservation officer,1325376051,43.150704,-112.154481,0,56,0,1
3,3,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,M,Boulder,MT,59632,46.2306,-112.1138,1939,Patent attorney,1325376076,47.034331,-112.561071,0,52,0,1
4,4,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,M,Doe Hill,VA,24433,38.4207,-79.4629,99,Dance movement psychotherapist,1325376186,38.674999,-78.632459,0,32,0,1
5,5,4767265376804500,"fraud_Stroman, Hudson and Erdman",gas_transport,94.63,F,Dublin,PA,18917,40.375,-75.2045,2158,Transport planner,1325376248,40.653382,-76.152667,0,57,0,1
6,6,30074693890476,fraud_Rowe-Vandervort,grocery_net,44.54,F,Holcomb,KS,67851,37.9931,-100.9893,2691,Arboriculturist,1325376282,37.162705,-100.15337,0,25,0,1
7,7,6011360759745864,fraud_Corwin-Collins,gas_transport,71.65,M,Edinburg,VA,22824,38.8432,-78.6003,6018,"Designer, multimedia",1325376308,38.948089,-78.540296,0,71,0,1
8,8,4922710831011201,fraud_Herzog Ltd,misc_pos,4.27,F,Manor,PA,15665,40.3359,-79.6607,1472,Public affairs consultant,1325376318,40.351813,-79.958146,0,77,0,1
9,9,2720830304681674,"fraud_Schoen, Kuphal and Nitzsche",grocery_pos,198.39,F,Clarksville,TN,37040,36.522,-87.349,151785,Pathologist,1325376361,37.179198,-87.485381,0,44,0,1


In [6]:
# Columns that are one-hot encoded by the preprocessing step.
categorical_cols = [
    'gender',
    'category',
    'state',
    'job',
]

# Columns that are standardised by the preprocessing step.
numeric_cols = [
    'amt',
    'age',
    'hour',
    'day_of_week',
    'city_pop',
    'lat',
    'long',
    'merch_lat',
    'merch_long',
]

# Label column predicted by the classifier.
target_col = 'is_fraud'


In [7]:
# Column-wise preprocessing: standardise the numeric features and one-hot
# encode the categorical ones. handle_unknown='ignore' keeps transform() from
# failing on categories that were not seen during fit. Columns not listed in
# either group are not passed through (ColumnTransformer's default remainder).
numeric_step = ('num', StandardScaler(), numeric_cols)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])


In [2]:
import torch

# Environment check: report whether PyTorch can see a CUDA device.
cuda_ready = torch.cuda.is_available()  # True if a GPU is available
gpu_count = torch.cuda.device_count()   # number of visible GPUs
print(cuda_ready)
print(gpu_count)

False
0


In [None]:
# Feature matrix and target, restricted to the columns declared above.
X = df[categorical_cols + numeric_cols]
y = df[target_col]

# Hold out 20% of the rows; stratifying on y keeps the class proportions the
# same in the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Bundling the preprocessor with the classifier means scaling and encoding are
# fitted on the training split only.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    # n_jobs=-1 builds the trees in parallel on all available cores.
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1))
])

pipeline.fit(X_train, y_train)
print("Accuracy:", pipeline.score(X_test, y_test))

# Accuracy alone is a weak signal when one class is rare (typical for fraud
# labels -- confirm with y.value_counts()): a model that never predicts fraud
# can still score very high. Report per-class precision/recall and the
# precision-recall summary for the fraud class as well.
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred, digits=4))

# Column of predict_proba that holds the probability of the fraud class.
# Assumes fraud is encoded as 1 in is_fraud -- TODO confirm.
fraud_col = list(pipeline.classes_).index(1)
fraud_scores = pipeline.predict_proba(X_test)[:, fraud_col]
print("Average precision (fraud class):", average_precision_score(y_test, fraud_scores))
