In [None]:
# Merge the two DataFrames (outer join to retain all rows)
import pandas as pd

df1 = pd.read_csv("2017_GSM_parsed.csv", delimiter=",", header="infer")
print(df1.columns)
df2 = pd.read_csv("2024_GSM_parsed.csv", delimiter=",", header="infer")
print(df2.columns)

merged_df = pd.concat([df1, df2])

# Initialize a final DataFrame to store the cleaned rows
final_rows = []

# Group by the 'id' column to compare duplicates
for id, group in merged_df.groupby('id'):
    if len(group) == 1:
        # Case 1: Unique or Identical Row (only one row in group)
        final_rows.append(group.iloc[0])
    else:
        # Case 2: Multiple Rows (Duplicates found)
        combined_row = group.iloc[0].copy()  # Start with the first row
        conflicting_rows = []

        for _, row in group.iterrows():
            for col in row.index:
                if pd.isna(combined_row[col]) and not pd.isna(row[col]):
                    # Fill missing values
                    combined_row[col] = row[col]
                elif combined_row[col] != row[col] and not pd.isna(combined_row[col]) and not pd.isna(row[col]):
                    # Conflict detected
                    conflicting_rows.append(row)

        # If no conflicts, keep the combined row
        if not conflicting_rows:
            final_rows.append(combined_row)
        else:
            # Keep all conflicting rows with unique ids
            counter = 1
            combined_row['id'] = f"{id} ({counter})"
            final_rows.append(combined_row)
            
            print(conflicting_rows)

            for conflict_row in conflicting_rows:
                counter += 1
                conflict_row = conflict_row.copy()
                conflict_row['id'] = f"{id} ({counter})"
                final_rows.append(conflict_row)

# Combine all rows back into a DataFrame
merged_df = pd.DataFrame(final_rows)

# Drop intermediate '_df1' and '_df2' suffixes, if created during merge
merged_df.columns = [col.split('_')[0] for col in merged_df.columns]

# Display final DataFrame
print(merged_df)
#duplicates = {'2g_bands':'2g', '3g_bands':'3g', '4g_bands':'4g', 'network_speed', 'gprs', 'edge', 'announced', 'status', 'dimentions':'dimensions', 'sim', 'display_type', 'display_resolution', 'display_size', 'os', 'cpu', 'chipset', 'gpu', 'memory_card':'memory(external)', 'internal_memory':'memory(internal)', 'ram', 'primary_camera', 'secondary_camera', 'loud_speaker', 'audio_jack', 'wlan', 'bluetooth', 'gps', 'nfc', 'radio', 'usb', 'sensors', 'battery', 'colors', 'approx_price_eur', 'img_url'}

Index(['id', 'brand', 'model', 'date', 'year', 'display_size_inches',
       'internal_memory_in_gb', 'primary_camera_mega_pixel', 'primary_camera',
       'loud_speaker', 'gps', 'colors', 'approx_price_eur', 'battery_mah',
       'ram_in_gb'],
      dtype='object')
Index(['id', 'brand', 'model', 'date', 'year', 'display_size_inches',
       'internal_memory_in_gb', 'primary_camera_mega_pixel', 'primary_camera',
       'loud_speaker', 'gps', 'colors', 'approx_price_eur', 'battery_mah',
       'ram_in_gb'],
      dtype='object')


In [12]:
# scale
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the CSV into a DataFrame
file_path = '2017_GSM_parsed.csv'
scaled_file_path = '../2_joined_data/2017_GSM_scaled.csv'
df = pd.read_csv(file_path)

# Identify numerical columns
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns

# Apply StandardScaler to the numerical columns and add them as new columns
scaler = StandardScaler()
scaled_cols = [col + '_scale' for col in numerical_cols]
scaled_data = scaler.fit_transform(df[numerical_cols])

# Create a new DataFrame with scaled columns
scaled_df = pd.DataFrame(scaled_data, columns=scaled_cols, index=df.index)

# Combine original and scaled columns
for orig_col, scaled_col in zip(numerical_cols, scaled_cols):
    # Find the position of the original column
    col_idx = df.columns.get_loc(orig_col)
    # Insert the scaled column right after the original column
    df.insert(col_idx + 1, scaled_col, scaled_df[scaled_col])

# Step 4 (Optional): Save the scaled data back to a new CSV file
df.to_csv(scaled_file_path, index=False)

print(f"Scaled data saved to {scaled_file_path}")

Scaled data saved to ../2_joined_data/2017_GSM_scaled.csv


In [14]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

file_path = '2017_GSM_parsed.csv'
scaled_file_path = '../2_joined_data/2017_GSM_scaled.csv'
df = pd.read_csv(file_path)

# Identify numerical columns
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
numerical_cols = numerical_cols[numerical_cols != 'year']

# Apply StandardScaler within each group
group_col = 'year'
def scale_group(group):
    scaler = StandardScaler()
    group[numerical_cols] = scaler.fit_transform(group[numerical_cols])
    return group

# Step 5: Apply scaling within each group
df = df.groupby(group_col).apply(scale_group)

df.to_csv(scaled_file_path, index=False)

print(f"Grouped scaled data saved to {scaled_file_path}")

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  df = df.groupby(group_col).apply(scale_group)


Grouped scaled data saved to ../2_joined_data/2017_GSM_scaled.csv


In [6]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# File paths
file_path = '2017_GSM_parsed.csv'
scaled_file_path = '../2_joined_data/2017_GSM_scaled.csv'

# Load the CSV into a DataFrame
df = pd.read_csv(file_path)

# Identify numerical columns, excluding 'year'
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
numerical_cols = numerical_cols[numerical_cols != 'year']

# Apply scaling within each group while retaining original columns
group_col = 'year'

def scale_group(group):
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(group[numerical_cols])
    scaled_cols = [col + '_scale' for col in numerical_cols]
    scaled_df = pd.DataFrame(scaled_data, columns=scaled_cols, index=group.index)
    for orig_col, scaled_col in zip(numerical_cols, scaled_cols):
        # Insert scaled columns right after the original columns
        col_idx = group.columns.get_loc(orig_col)
        group.insert(col_idx + 1, scaled_col, scaled_df[scaled_col])
    return group

# Apply scaling to each group by the specified column (e.g., 'year')
df = df.groupby(group_col, group_keys=False).apply(scale_group)

# Save the final DataFrame to a CSV file
df.to_csv(scaled_file_path, index=False)

print(f"Grouped scaled data saved to {scaled_file_path}")

Grouped scaled data saved to ../2_joined_data/2017_GSM_scaled.csv


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  df = df.groupby(group_col, group_keys=False).apply(scale_group)
