In [None]:
from pathlib import Path
from dotenv import load_dotenv
import numpy as np
import os
import pandas as pd
import sys
import yfinance as yf
from src.cleaning import *

# Load environment configuration first (python-dotenv; by default it looks for a .env file).
load_dotenv()

# Data locations can be overridden through the environment; otherwise use repo-relative defaults.
RAW = Path(os.environ.get("DATA_DIR_RAW", "data/raw"))
PROC = Path(os.environ.get("DATA_DIR_PROCESSED", "data/processed"))
src_path = Path("./src")

# Create every working directory up front; existing directories are left untouched.
for _directory in (RAW, PROC, src_path):
    _directory.mkdir(parents=True, exist_ok=True)

In [None]:
# Build a small sample frame with deliberately missing values (np.nan)
# in the numeric columns, then persist it as the raw input file.
data = dict(
    age=[34, 45, 29, 50, 38, np.nan, 41],
    income=[55000, np.nan, 42000, 58000, np.nan, np.nan, 49000],
    score=[0.82, 0.91, np.nan, 0.76, 0.88, 0.65, 0.79],
    zipcode=['90210', '10001', '60614', '94103', '73301', '12345', '94105'],
    city=['Beverly', 'New York', 'Chicago', 'SF', 'Austin', 'Unknown', 'San Francisco'],
    extra_data=[np.nan, 42, np.nan, np.nan, np.nan, 5, np.nan],
)
df_sample = pd.DataFrame(data)

# index=False keeps the row index out of the CSV.
df_sample.to_csv(f"{RAW}/sample.csv", index=False)

In [9]:
# Display the sample frame to see which cells are missing before any cleaning.
df_sample

Unnamed: 0,age,income,score,zipcode,city,extra_data
0,34.0,55000.0,0.82,90210,Beverly,
1,45.0,,0.91,10001,New York,42.0
2,29.0,42000.0,,60614,Chicago,
3,50.0,58000.0,0.76,94103,SF,
4,38.0,,0.88,73301,Austin,
5,,,0.65,12345,Unknown,5.0
6,41.0,49000.0,0.79,94105,San Francisco,


**Try the cleaning functions to check that they work**

In [None]:
# Try drop_missing with the `threshold` argument; the result is only displayed, not assigned.
# drop_missing comes from src.cleaning via the star import, so the exact meaning of
# `threshold` is defined there — TODO confirm (presumably a required non-null fraction).
# NOTE(review): assumes the helper returns a new frame and leaves df_sample untouched — verify.
drop_missing(df_sample,threshold=0.9)

In [None]:
# Try drop_missing with the `columns` argument set to "income" and "score"
# (presumably only these columns are checked for missing values — confirm in src.cleaning).
drop_missing(df_sample,columns=["income","score"])

In [None]:
# Try the filling_missing_median function on the sample frame
# (presumably fills missing values with column medians — confirm in src.cleaning).
filling_missing_median(df_sample)

In [None]:
# Try the normalize_data function with method="minmax"; the result is only displayed.
normalize_data(df_sample,method="minmax")

In [None]:
# Try the normalize_data function with method="standard"; the result is only displayed.
normalize_data(df_sample,method="standard")

**Cleaning sample data**

In [16]:
# Build the processed frame as a single chain: remove the "extra_data" column,
# then apply drop_missing, filling_missing_median and normalize_data in sequence.
# Each helper receives the frame produced by the previous step as its first argument.
df_processed = (
    df_sample
    .drop(columns=["extra_data"])
    .pipe(drop_missing, threshold=0.9)
    .pipe(filling_missing_median)
    .pipe(normalize_data, method="standard")
)
df_processed

Unnamed: 0,age,income,score,zipcode,city
0,-0.797325,0.730096,-0.196116,90210,Beverly
1,0.797325,0.132745,1.568929,10001,New York
2,-1.522165,-1.858425,-0.196116,60614,Chicago
3,1.522165,1.327447,-1.372813,94103,SF
4,-0.217452,0.132745,0.980581,73301,Austin
6,0.217452,-0.464606,-0.784465,94105,San Francisco


In [19]:
# Write the processed frame into the processed-data directory; the row index is not written.
df_processed.to_csv(path_or_buf=f"{PROC}/sample_proc.csv", index=False)