In [1]:
# Import libraries
import os
import time
from pytrends.request import TrendReq
import pandas as pd
import random
import statistics
import csv
import re
import ast
import numpy as np

In [2]:
def normalize_time_series(csv_file_path, output_csv_path):
    """Normalize every raw CSV in a folder and merge the results into one CSV.

    Each ``.csv`` file found in ``csv_file_path`` is read with ``;`` as the
    separator, its column names are lower-cased, and its rows are grouped by
    the ``date`` column and handed group by group to ``perform_normalization``.
    Columns whose name contains ``'air travel'`` are then dropped, and only the
    columns whose name was not already contributed by an earlier file are kept.
    The kept columns of all files are concatenated side by side and written to
    ``output_csv_path``.

    Parameters
    ----------
    csv_file_path : str
        Directory holding the raw ``.csv`` files (despite the name, a folder).
    output_csv_path : str
        Destination of the merged ``;``-separated table (written without index).

    Notes
    -----
    Frames are combined with ``pd.concat(axis=1)``, i.e. aligned on the row
    index and not on the ``date`` value.
    NOTE(review): this presumably relies on every file covering the same dates
    in the same row order - confirm against the raw data.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the column
    # order and the "first file wins" rule for duplicated names reproducible.
    csv_files = sorted(
        os.path.join(csv_file_path, name)
        for name in os.listdir(csv_file_path)
        if name.endswith('.csv')
    )
    seen_columns = set()   # column names already contributed by an earlier file
    frames = []            # per-file frames holding only the new columns

    # Read and normalize the CSVs file by file, keeping only unseen column names
    for file in csv_files:
        df = pd.read_csv(file, sep=';', index_col=False)

        # Make the column names lowercase so 'date' / 'ispartial' match
        df.columns = df.columns.str.lower()
        normalized_df = df.groupby("date", group_keys=False).apply(perform_normalization)

        # Drop every column whose name contains 'air travel'
        air_travel_columns = [col for col in normalized_df.columns if 'air travel' in col]
        normalized_df = normalized_df.drop(columns=air_travel_columns)

        cols_to_add = [col for col in normalized_df.columns if col not in seen_columns]
        seen_columns.update(cols_to_add)
        frames.append(normalized_df[cols_to_add])

    # Concatenate once instead of growing a frame inside the loop; an empty
    # folder still produces an (empty) output file.
    final_df = pd.concat(frames, axis=1) if frames else pd.DataFrame()

    final_df.to_csv(output_csv_path, sep=';', index=False)
    print(f"CSV file saved: {output_csv_path}")
    
def perform_normalization(batch_df):
    """Scale the tag columns of one batch by the batch's reference value.

    The first column is kept untouched as the date column and the
    ``ispartial`` column (if present) is discarded. In the remaining columns
    every 0 is replaced by 1, then all of them are divided by a single scalar:
    the value in the first row of the last remaining column.

    Parameters
    ----------
    batch_df : pandas.DataFrame
        Frame whose first column is the date column, followed by the columns
        to normalize. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The first column followed by the scaled columns.
    """
    date = batch_df.iloc[:, 0]

    # Build new frames instead of dropping/assigning in place, so the caller's
    # data (e.g. a group handed over by groupby.apply) is never mutated.
    # errors='ignore' tolerates input that has no 'ispartial' column.
    trimmed = batch_df.drop(columns='ispartial', errors='ignore')

    # Zeros become ones so the reference value below can never be zero.
    tags_to_normalize = trimmed.iloc[:, 1:].replace(0, 1)

    # First row of the last remaining column is the common divisor.
    reference_value = tags_to_normalize.iloc[0, -1]
    normalized_tags = tags_to_normalize.divide(reference_value)

    return pd.concat([date, normalized_tags], axis=1)

### Normalize data for <b>2022</b>

In [3]:
# Run the normalization over the CSV files in the raw_results_2022 folder and
# write the merged table to normalized_results_2022.csv.
# NOTE(review): both paths are absolute and machine-specific, so this cell only
# runs where this exact folder layout exists.
csv_file_path = r"C:\School\Semester_1\Data_Wrangling\Data_in_the_wild_exam\data\raw\pytrends\raw_results_2022"
output_csv_path = r"C:\School\Semester_1\Data_Wrangling\Data_in_the_wild_exam\data\stage\pytrends\normalized_results_2022\normalized_results_2022.csv"
normalize_time_series(csv_file_path, output_csv_path)

CSV file saved: C:\School\Semester_1\Data_Wrangling\Data_in_the_wild_exam\data\stage\pytrends\normalized_results_2022\normalized_results_2022.csv


### Normalize data for <b>2023</b>

In [4]:
# Run the normalization over the CSV files in the raw_results_2023 folder and
# write the merged table to normalized_results_2023.csv.
# NOTE(review): both paths are absolute and machine-specific, so this cell only
# runs where this exact folder layout exists. The names csv_file_path and
# output_csv_path are rebound here, replacing any values assigned earlier.
csv_file_path = r"C:\School\Semester_1\Data_Wrangling\Data_in_the_wild_exam\data\raw\pytrends\raw_results_2023"
output_csv_path = r"C:\School\Semester_1\Data_Wrangling\Data_in_the_wild_exam\data\stage\pytrends\normalized_results_2023\normalized_results_2023.csv"
normalize_time_series(csv_file_path, output_csv_path)

CSV file saved: C:\School\Semester_1\Data_Wrangling\Data_in_the_wild_exam\data\stage\pytrends\normalized_results_2023\normalized_results_2023.csv


In [46]:
# Scratch check of the suffix-stripping regex: it removes every ".<digits>" run
# that ends at the end of the string or at a word boundary (e.g. "date.1" -> "date").
# The sample is bound to `raw_col_name` rather than `str`, so the built-in `str`
# type is not shadowed for the rest of the kernel session.
raw_col_name = "date.1"
col_name = re.sub(r'\.(\d+)$|\.(\d+)\b', '', raw_col_name)
print(col_name)

date
