In [29]:
import pandas as pd
import requests
import glob
import csv
import os

# Base URL of the prediction API; the "/predict-file/" endpoint is appended to it
API_URL = "http://127.0.0.1:8000"

# File name of the merged CSV
output_file = 'combined_data.csv'
# Folder that holds the split CSV files
folder_path = 'good_data'

# 1. Create a directory; called twice: create_directory("good_data") and create_directory("bad_data")
def create_directory(directory_name):
  """Create ``directory_name`` (including missing parents) and report the outcome.

  Args:
    directory_name: Path of the directory to create. May be nested,
      e.g. ``"data/good"``.

  Prints a "created successfully" message when the directory was created and
  an "already exists" message when the path was already present.
  """
  try:
    # Attempt the creation directly instead of testing os.path.exists first:
    # a check-then-create sequence can race with another process, and
    # os.mkdir cannot create intermediate directories.
    os.makedirs(directory_name)
  except FileExistsError:
    print(f"Directory '{directory_name}' already exists.")
  else:
    print(f"Directory '{directory_name}' created successfully.")

# 2. Split the source file into multiple files and store them in the good_data folder:
def split_csv(input_file, output_prefix, rows_per_file, output_dir="good_data"):
    """Split a CSV file into numbered chunk files that each repeat the header.

    Chunks are written as ``<output_dir>/<output_prefix>_<n>.csv`` with ``n``
    starting at 1. Every chunk starts with the header row of ``input_file``
    followed by at most ``rows_per_file`` data rows.

    Args:
        input_file: Path of the CSV file to split.
        output_prefix: File-name prefix of the generated chunk files.
        rows_per_file: Maximum number of data rows per chunk.
        output_dir: Folder receiving the chunks (created if missing).
            Defaults to ``"good_data"``, the folder used previously.

    An empty input file (no header row) produces no output. An input that
    only contains a header produces no chunk files.
    """
    # newline='' lets the csv module handle line endings itself, so newlines
    # embedded in quoted fields survive the round trip.
    with open(input_file, 'r', newline='') as infile:
        reader = csv.reader(infile)
        header = next(reader, None)
        if header is None:
            # Completely empty file: nothing to split.
            return

        os.makedirs(output_dir, exist_ok=True)

        chunk_handle = None
        writer = None
        row_count = 0
        file_count = 1

        try:
            for row in reader:
                if chunk_handle is None:
                    chunk_path = os.path.join(output_dir, f"{output_prefix}_{file_count}.csv")
                    chunk_handle = open(chunk_path, 'w', newline='')
                    writer = csv.writer(chunk_handle)
                    writer.writerow(header)
                writer.writerow(row)
                row_count += 1
                if row_count == rows_per_file:
                    chunk_handle.close()
                    chunk_handle = None
                    row_count = 0
                    file_count += 1
        finally:
            # Close a partially filled (or failed) chunk exactly once.
            if chunk_handle is not None:
                chunk_handle.close()
            
# 3. Merge data from the good_data folder into a single file, with some filters, and create a new merged file
def get_data (file_path, output_path=None, columns=None):
    """Merge every CSV file of a folder into one combined CSV file.

    Args:
        file_path: Folder to scan for ``*.csv`` files. (The name is kept for
            backward compatibility; it is a folder, not a single file.)
        output_path: Destination of the merged CSV. When ``None`` the
            module-level ``output_file`` is used.
        columns: Optional list of column names to keep. When given, files
            lacking any of these columns are skipped and the remaining files
            are reduced to exactly these columns. When ``None`` all columns
            are kept.

    Prints one line per processed or ignored file and a final status line.
    Nothing is written when no file qualifies.
    """
    destination = output_file if output_path is None else output_path

    frames = []

    # os.listdir returns names in arbitrary order; sort them so the row order
    # of the merged file is reproducible.
    for file_name in sorted(os.listdir(file_path)):
        if not file_name.endswith('.csv'):
            continue

        df = pd.read_csv(os.path.join(file_path, file_name))

        if columns is not None:
            missing_columns = set(columns) - set(df.columns)
            if missing_columns:
                print(f"Ignored file: {file_name} (missing columns: {missing_columns})")
                continue
            df = df[list(columns)]

        frames.append(df)
        print(f"Processed file: {file_name}")

    if frames:
        combined_df = pd.concat(frames, ignore_index=True)
        combined_df.to_csv(destination, index=False)
        print(f"Combined data saved to: {destination}")
    else:
        print("No files to combine.")

# 4. Predict the combined data from step 3.
def ingestion_predict (uploaded_file, timeout=(10, 600)):
    """Upload a CSV file to the prediction API and print the fraud count.

    The file is posted as multipart form field ``file`` to
    ``{API_URL}/predict-file/``. On HTTP 200 the JSON body's
    ``predicted_data`` entry is loaded into a DataFrame and the number of
    rows with ``is_fraud == 1`` is printed; otherwise the status code and
    response text are printed.

    Args:
        uploaded_file: Path of the CSV file to send.
        timeout: ``(connect, read)`` timeout in seconds passed to
            ``requests.post``. Without a timeout the call can block forever
            if the API never answers.

    Raises:
        requests.RequestException: On connection problems or timeouts.
        KeyError: If a successful response lacks ``predicted_data`` or the
            returned rows lack an ``is_fraud`` field.
    """
    # Binary mode: the bytes are forwarded untouched to the API.
    with open(uploaded_file, 'rb') as f:
        response = requests.post(
            f"{API_URL}/predict-file/",
            files={"file": f},
            timeout=timeout,
        )

    if response.status_code == 200:
        result = response.json()
        df = pd.DataFrame(result["predicted_data"])
        # An empty prediction list yields a DataFrame without columns, so
        # indexing 'is_fraud' would fail; zero rows means zero fraud cases.
        fraud_count = 0 if df.empty else int((df['is_fraud'] == 1).sum())
        print(f"Number of fraud cases: {fraud_count}")
    else:
        print(f"Error: {response.status_code} - {response.text}")

# Pipeline inputs: source CSV, file-name prefix of the chunks, and data rows per chunk
input_file = '../data/testData.csv'
output_prefix = 'split_data'
rows_per_file = 1000

create_directory("good_data") # create folder good_data
create_directory("bad_data") # create folder bad_data
split_csv(input_file, output_prefix, rows_per_file) # split the source file and store the chunks in good_data for now


get_data('good_data') # build the merged file 'combined_data.csv'; 'good_data' is the folder name

ingestion_predict(output_file) # send the merged file to the prediction API; NOTE(review): original comment says the result is stored in a database - not visible here, confirm


Directory 'good_data' already exists.
Directory 'bad_data' already exists.
Processed file: split_data_1.csv
Processed file: split_data_10.csv
Processed file: split_data_11.csv
Processed file: split_data_12.csv
Processed file: split_data_13.csv
Processed file: split_data_14.csv
Processed file: split_data_15.csv
Processed file: split_data_16.csv
Processed file: split_data_17.csv
Processed file: split_data_18.csv
Processed file: split_data_19.csv
Processed file: split_data_2.csv
Processed file: split_data_20.csv
Processed file: split_data_21.csv
Processed file: split_data_22.csv
Processed file: split_data_23.csv
Processed file: split_data_24.csv
Processed file: split_data_25.csv
Processed file: split_data_26.csv
Processed file: split_data_27.csv
Processed file: split_data_28.csv
Processed file: split_data_29.csv
Processed file: split_data_3.csv
Processed file: split_data_30.csv
Processed file: split_data_4.csv
Processed file: split_data_5.csv
Processed file: split_data_6.csv
Processed fil

In [26]:
# Run the prediction on the merged file; to predict a single chunk instead,
# pass its path, e.g. 'good_data/split_data_12.csv'
ingestion_predict(output_file)

Number of fraud cases: 2766
   Unnamed: 0 trans_date_trans_time            cc_num  \
0      300000   2020-10-12 14:06:46      571314334723   
1      300001   2020-10-12 14:07:07     4600155880464   
2      300002   2020-10-12 14:07:18  6011252220172077   
3      300003   2020-10-12 14:07:53  3595192916105588   
4      300004   2020-10-12 14:07:54  3546897637165774   

                            merchant        category     amt    first  \
0                fraud_Stamm-Witting    shopping_net    3.02     Lori   
1  fraud_Mante, Luettgen and Hackett  health_fitness   20.24  Charles   
2                fraud_Douglas-White   entertainment  114.16   Melvin   
3                 fraud_Predovic Inc    shopping_net    4.36   Dustin   
4              fraud_Kutch-Wilderman            home  100.35    Kayla   

       last gender                       street  ...      lat     long  \
0    Mclean      F   4548 Werner Wells Apt. 441  ...  39.8882 -79.8694   
1  Copeland      M               92213 Lee

In [25]:
# Rebuild the merged CSV; prints one "Processed file" line per CSV found
get_data('good_data')

Processed file: split_data_1.csv
Processed file: split_data_10.csv
Processed file: split_data_11.csv
Processed file: split_data_12.csv
Processed file: split_data_13.csv
Processed file: split_data_14.csv
Processed file: split_data_15.csv
Processed file: split_data_16.csv
Processed file: split_data_17.csv
Processed file: split_data_18.csv
Processed file: split_data_19.csv
Processed file: split_data_2.csv
Processed file: split_data_20.csv
Processed file: split_data_21.csv
Processed file: split_data_22.csv
Processed file: split_data_23.csv
Processed file: split_data_24.csv
Processed file: split_data_25.csv
Processed file: split_data_26.csv
Processed file: split_data_27.csv
Processed file: split_data_28.csv
Processed file: split_data_29.csv
Processed file: split_data_3.csv
Processed file: split_data_30.csv
Processed file: split_data_4.csv
Processed file: split_data_5.csv
Processed file: split_data_6.csv
Processed file: split_data_7.csv
Processed file: split_data_8.csv
Processed file: split_

In [11]:
import os
import pandas as pd

# Source folder with the CSV chunks and name of the merged output file
folder_path = 'good_data'
output_file = 'combined_data.csv'

# Collects one DataFrame per accepted CSV file
dfs = []

# Visit every directory entry, skipping anything that is not a CSV
for file_name in os.listdir(folder_path):
    if not file_name.endswith('.csv'):
        continue

    file_path = os.path.join(folder_path, file_name)
    df = pd.read_csv(file_path)

    # Only files WITHOUT an 'is_fraud' column are accepted
    if 'is_fraud' not in df.columns:
        dfs.append(df)
        print(f"Processed file: {file_name}")
    else:
        print(f"Ignored file: {file_name} (contains 'is_fraud' column)")

# Report when nothing was accepted; otherwise merge and write the result
if not dfs:
    print("No files to combine.")
else:
    combined_df = pd.concat(dfs, ignore_index=True)
    combined_df.to_csv(output_file, index=False)
    print(f"Combined data saved to: {output_file}")


Ignored file: split_data_1.csv (contains 'is_fraud' column)
Ignored file: split_data_10.csv (contains 'is_fraud' column)
Ignored file: split_data_11.csv (contains 'is_fraud' column)
Ignored file: split_data_12.csv (contains 'is_fraud' column)
Ignored file: split_data_2.csv (contains 'is_fraud' column)
Ignored file: split_data_3.csv (contains 'is_fraud' column)
Ignored file: split_data_4.csv (contains 'is_fraud' column)
Ignored file: split_data_5.csv (contains 'is_fraud' column)
Ignored file: split_data_6.csv (contains 'is_fraud' column)
Ignored file: split_data_7.csv (contains 'is_fraud' column)
Ignored file: split_data_8.csv (contains 'is_fraud' column)
Ignored file: split_data_9.csv (contains 'is_fraud' column)
No files to combine.


In [12]:
import os
import pandas as pd

# Source folder with the CSV chunks and name of the merged output file
folder_path = 'good_data'
output_file = 'combined_data.csv'

# Columns that survive into the merged CSV, in this order
required_columns = [
    'merchant', 'category', 'amt', 'gender', 'lat', 'long',
    'city_pop', 'job', 'unix_time', 'merch_lat', 'merch_long',
]

# Collects one column-filtered DataFrame per accepted CSV file
dfs = []

# Visit every directory entry, skipping anything that is not a CSV
for file_name in os.listdir(folder_path):
    if not file_name.endswith('.csv'):
        continue

    file_path = os.path.join(folder_path, file_name)
    df = pd.read_csv(file_path)

    # A file qualifies only when none of the required columns is missing
    missing_columns = set(required_columns) - set(df.columns)
    if not missing_columns:
        df_filtered = df[required_columns]
        dfs.append(df_filtered)
        print(f"Processed file: {file_name}")
    else:
        print(f"Ignored file: {file_name} (missing columns: {missing_columns})")

# Report when nothing was accepted; otherwise merge and write the result
if not dfs:
    print("No files to combine.")
else:
    combined_df = pd.concat(dfs, ignore_index=True)
    combined_df.to_csv(output_file, index=False)
    print(f"Combined data saved to: {output_file}")


Processed file: split_data_1.csv
Processed file: split_data_10.csv
Processed file: split_data_11.csv
Processed file: split_data_12.csv
Processed file: split_data_2.csv
Processed file: split_data_3.csv
Processed file: split_data_4.csv
Processed file: split_data_5.csv
Processed file: split_data_6.csv
Processed file: split_data_7.csv
Processed file: split_data_8.csv
Processed file: split_data_9.csv
Combined data saved to: combined_data.csv
