# Data Analytics Project - Preprocessing

# Cleaning Data

In [1]:
# importing required libraries
import os

import pandas as pd

### Extracting CSV files from folders

In [2]:
# Flatten the per-company folders: every file found inside a company folder
# is moved into one shared folder, and the emptied company folder is removed.
companies_folders = os.listdir("../Storage/Companies")

print()
print("Number of companies folders:", len(companies_folders))

# The destination folder doubles as the "already done" marker: when it
# exists, the whole extraction below is skipped.
extraction_done = os.path.exists("../Storage/Companies_csvs/")

if not extraction_done:
    os.mkdir("../Storage/Companies_csvs/")

    for folder_name in companies_folders:
        folder_path = f"../Storage/Companies/{folder_name}"

        # Entries that are not directories are left untouched
        if not os.path.isdir(folder_path):
            continue

        # Move each entry of the company folder into the shared folder ...
        for entry_name in os.listdir(folder_path):
            os.rename(f"{folder_path}/{entry_name}", f"../Storage/Companies_csvs/{entry_name}")

        # ... then delete the company folder (os.rmdir requires it to be empty)
        os.rmdir(folder_path)

companies_csv_files = os.listdir("../Storage/Companies_csvs")

print("After processing ...")
print("Number of companies csv files:", len(companies_csv_files))
print()



Number of companies folders: 3792
After processing ...
Number of companies csv files: 4993



### Handling missing data

We are dropping rows of the datasets which have missing data.

Interpolation would not work in this scenario, as filling in data for missing days may introduce discrepancies, which might affect the mean-reverting nature of larger stocks and increase volatility.


In [3]:
# Write a copy of every company csv with all NaN-containing rows removed

companies_csvs = os.listdir("../Storage/Companies_csvs")

print("Number of companies csv files:", len(companies_csvs))

# An existing output folder means this step already ran, so it is skipped.
# Delete that folder to force the step to run again.
if not os.path.exists("../Storage/Companies_drop_rows/"):

    os.mkdir("../Storage/Companies_drop_rows/")

    for csv_name in companies_csvs:
        # The first csv column is used as the dataframe index
        raw_frame = pd.read_csv(f"../Storage/Companies_csvs/{csv_name}", index_col=[0])
        complete_rows = raw_frame.dropna()
        complete_rows.to_csv(f"../Storage/Companies_drop_rows/{csv_name}")

companies_drop_rows = os.listdir("../Storage/Companies_drop_rows")

print("After processing ...")

print("Number of companies csv files cleaned:", len(companies_drop_rows))


Number of companies csv files: 4993
After processing ...
Number of companies csv files cleaned: 4993


### Cropping all datasets to the range 2017-2019 (inclusive)

In [4]:
# Keep only the datasets that have enough rows, and crop those to the
# years 2017-2019 before writing them to a new folder

companies_drop_rows = os.listdir("../Storage/Companies_drop_rows/")

print()

print("Number of companies csv files with dropped rows:", len(companies_drop_rows))

# An existing output folder means this step already ran, so it is skipped.
# Delete that folder to force the step to run again.
if not os.path.exists("../Storage/Companies_in_range/"):

    os.mkdir("../Storage/Companies_in_range/")

    for csv_name in companies_drop_rows:
        frame = pd.read_csv(f"../Storage/Companies_drop_rows/{csv_name}", index_col=[0])

        # Datasets with 1000 rows or fewer are skipped and never written out;
        # the row count is taken before the year filter is applied
        if frame.shape[0] <= 1000:
            continue

        # Boolean mask, True where the first four characters of the index
        # label are a wanted year (assumes the index holds date strings
        # that begin with the 4-digit year)
        year_mask = [label[:4] in ["2017", "2018", "2019"] for label in frame.index]
        frame[year_mask].to_csv(f"../Storage/Companies_in_range/{csv_name}")

companies_in_range = os.listdir("../Storage/Companies_in_range/")

print("After processing ...")

print("Number of companies csv files cleaned:", len(companies_in_range))

print()




Number of companies csv files with dropped rows: 4993
After processing ...
Number of companies csv files cleaned: 3558



### Adding Company name and Exchange columns

In [5]:
# Tag every row of each dataset with its company name and exchange, both
# taken from the csv file name, so rows stay identifiable later on

companies_in_range = os.listdir("../Storage/Companies_in_range/")

print()

# NOTE: the label below still says "dropped rows"; the number printed is the
# count of csvs in the Companies_in_range folder
print("Number of companies csv files with dropped rows:", len(companies_in_range))

# An existing output folder means this step already ran, so it is skipped.
# Delete that folder to force the step to run again.
if not os.path.exists("../Storage/Companies_with_names_exchange/"):

    os.mkdir("../Storage/Companies_with_names_exchange/")

    for csv_name in companies_in_range:
        frame = pd.read_csv(f"../Storage/Companies_in_range/{csv_name}", index_col=[0])

        # File name slicing: the last 4 characters are treated as the
        # extension, the 3 characters before them as the exchange, and
        # everything before that as the company name
        company_name = csv_name[:-7]
        exchange_code = csv_name[-7:-4]

        frame["Company"] = company_name
        frame["Exchange"] = exchange_code
        frame.to_csv(f"../Storage/Companies_with_names_exchange/{csv_name}")

companies_with_names_exchange = os.listdir("../Storage/Companies_with_names_exchange/")

print("After processing ...")

print("Number of companies csvs with names and exchanges added:", len(companies_with_names_exchange))

print()



Number of companies csv files with dropped rows: 3558
After processing ...
Number of companies csvs with names and exchanges added: 3558

