In [2]:
# Stack the 2017 and 2024 exports row-wise, then collapse rows that share an
# 'id' into a single row (this is a concat + coalesce, not a join).
import pandas as pd

df1 = pd.read_csv("2017_GSM_parsed.csv", delimiter=",", header="infer")
print(df1.columns)
df2 = pd.read_csv("2024_GSM_parsed.csv", delimiter=",", header="infer")
print(df2.columns)

# ignore_index gives every stacked row a unique label; without it the labels
# of df1 and df2 overlap and end up duplicated in the result.
stacked_df = pd.concat([df1, df2], ignore_index=True)

# One coalesced row per distinct 'id'
final_rows = []

# NOTE: groupby skips rows whose 'id' is NaN (pandas default dropna=True),
# so such rows are not carried over into merged_df.
for _, group in stacked_df.groupby('id'):
    # The first row seeds the result; a group of one is therefore kept as-is.
    combined_row = group.iloc[0].copy()

    # Only the rows after the first can contribute values or disagree.
    for _, row in group.iloc[1:].iterrows():
        for col in row.index:
            current, incoming = combined_row[col], row[col]
            if pd.isna(incoming):
                # Nothing to contribute for this column
                continue
            if pd.isna(current):
                # Fill a gap with the first non-missing value seen
                combined_row[col] = incoming
            elif current != incoming:
                # Conflict: keep the earlier value and report both
                print(f"{combined_row['id']}, {col}: \n{current}\n{incoming}\n")

    # Add the combined row to the list of valid rows
    final_rows.append(combined_row)

# Combine all rows back into a DataFrame with a clean 0..n-1 index; passing
# the columns keeps the schema even if no rows were collected.
merged_df = pd.DataFrame(final_rows, columns=stacked_df.columns).reset_index(drop=True)



Index(['id', 'brand', 'model', 'date', 'year', 'display_size_inches',
       'internal_memory_in_gb', 'primary_camera_mega_pixel', 'primary_camera',
       'loud_speaker', 'gps', 'colors', 'approx_price_eur', 'battery_mah',
       'ram_in_gb'],
      dtype='object')
Index(['id', 'brand', 'model', 'date', 'year', 'display_size_inches',
       'internal_memory_in_gb', 'primary_camera_mega_pixel', 'primary_camera',
       'loud_speaker', 'gps', 'colors', 'approx_price_eur', 'battery_mah',
       'ram_in_gb'],
      dtype='object')
Apple iPhone 3G, internal_memory_in_gb: 
16.0
8.0

Apple iPhone 3G, gps: 
Yes with A-GPS
GPS, A-GPS

Apple iPhone 3G, colors: 
Black(8/16 GB)| White (16 GB)
Black(8/16 GB), White (16 GB)

Apple iPhone 3GS, internal_memory_in_gb: 
32.0
8.0

Apple iPhone 3GS, primary_camera: 
3.15 MP| f/2.8| autofocus| 
3.15 MP, f/2.8, AF

Apple iPhone 3GS, gps: 
Yes with A-GPS
GPS, A-GPS

Apple iPhone 3GS, colors: 
Black| White 
Black, White 

Apple iPhone 4, internal_memory_in_g

In [8]:
# scale
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Work on an independent deep copy of merged_df (built by the merge cell) so
# that frame is not altered by the scaling steps below.
df = merged_df.copy(deep=True)

# Destination of the scaled table
scaled_file_path = '../2_joined_data/2017_2024_GSM_scaled.csv'

# Column whose values define the groups that are scaled independently
group_col = 'year'

# Numeric columns to scale; the grouping column itself is never scaled
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
numerical_cols = numerical_cols[numerical_cols != group_col]

def scale_group(group):
    """Return a copy of ``group`` with a standardized twin beside each numeric column.

    For every column listed in the module-level ``numerical_cols`` a new
    ``<column>_scale`` column is inserted immediately to the right of the
    original, holding that column's values transformed by a ``StandardScaler``
    fitted on this group only.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows of one group; must contain every column in ``numerical_cols``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and index as ``group`` plus the
        ``*_scale`` columns. The frame passed in is left unmodified.
    """
    # DataFrame.insert works in place, so operate on a copy to avoid
    # mutating the caller's frame.
    result = group.copy()

    if len(numerical_cols) == 0:
        # Nothing to scale; StandardScaler rejects input with zero features.
        return result

    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(result[numerical_cols])
    scaled_cols = [col + '_scale' for col in numerical_cols]
    # Reuse the group's index so the scaled values line up row-for-row
    scaled_df = pd.DataFrame(scaled_data, columns=scaled_cols, index=result.index)
    for orig_col, scaled_col in zip(numerical_cols, scaled_cols):
        # Look the position up on every pass: earlier insertions shift the
        # columns that follow them.
        col_idx = result.columns.get_loc(orig_col)
        result.insert(col_idx + 1, scaled_col, scaled_df[scaled_col])
    return result

# Scale each group_col group separately. The columns are selected explicitly
# (grouping column included) so that group_col is still present in the frame
# handed to scale_group and in the result; relying on apply to pass the
# grouping column implicitly is deprecated in recent pandas releases.
# NOTE: rows whose group_col is NaN are skipped by groupby (default dropna=True).
df = df.groupby(group_col, group_keys=False)[list(df.columns)].apply(scale_group)
df = df.sort_values([group_col, 'id'])

# Save the final DataFrame to a CSV file (row labels are not written)
df.to_csv(scaled_file_path, index=False)

print(f"Grouped scaled data saved to {scaled_file_path}")

Grouped scaled data saved to ../2_joined_data/2017_2024_GSM_scaled.csv


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  df = df.groupby(group_col, group_keys=False).apply(scale_group)


6414
6414
