In [84]:
# This script installs the necessary Python packages for data analysis and visualization.
!pip install pandas numpy matplotlib seaborn missingno scipy



In [None]:
################ Import all necessary libraries and data

%pip install openpyxl

import pandas as pd
import glob
import re  # For extracting year from filename

# Initialize storage
combined_data = []

# Get all Excel files
file_paths = glob.glob("C:/Users/Lenovo/Desktop/Dissertaion/China Data/Sales/*.xlsx")  # Update path

for file in file_paths:
    # Extract year from filename (assuming format like "...2023.xlsx")
    year = re.search(r'\d{4}', file).group()  # Finds first 4-digit number
    
    # Read all sheets from current file
    sheets_dict = pd.read_excel(file, sheet_name=None)
    
    for sheet_name, df in sheets_dict.items():
        # Add identifier columns
        df['year'] = int(year)          # From filename
        df['type'] = sheet_name         # From sheet name
        
        combined_data.append(df)

# Combine all sales data
final_df = pd.concat(combined_data, ignore_index=True)

# Import Price Data
price_file = pd.read_excel(
	"C:/Users/Lenovo/Desktop/Dissertaion/China Data/Price 2015-202309.xlsx",
	sheet_name="中国汽车分车型每月销售量"
)

# Import Population Data
population_file = pd.read_excel(
    "C:/Users/Lenovo/Desktop/Dissertaion/China Data/Geographical Controls/(10)2000-2023年人口密度.xls",
    sheet_name=None
)

Note: you may need to restart the kernel to use updated packages.


In [89]:
################ Process Price Data
# Rename columns to English first
price_file = price_file.rename(columns=column_mapping)

# Standardize column names to lowercase with underscores
def standardize_columns(df):
    df.columns = (df.columns
                 .str.lower()  # Convert to lowercase
                 .str.replace(' ', '_')  # Replace spaces with underscores
                 .str.replace('-', '_'))  # Replace hyphens with underscores
    return df

# Apply standardization to both DataFrames
price_file = standardize_columns(price_file)

# Drop irrelevant columns (keep only columns we need)
price_file = price_file[columns_to_keep_en].copy()

# Convert price ranges to midpoints using the function from cell 2
price_file['price'] = price_file['price'].apply(price_range_to_midpoint)

# Calculate yearly weighted average price
def calculate_weighted_avg(group):
    total_sales = group['sales'].sum()
    weighted_sum = (group['sales'] * group['price']).sum()
    return weighted_sum / total_sales if total_sales != 0 else 0

yearly_weighted_prices = price_df.groupby(['model', 'year']).apply(calculate_weighted_avg).reset_index()
yearly_weighted_prices.columns = ['model', 'year', 'weighted_Avg_Price']

# Add total yearly sales for context
yearly_sales = price_df.groupby(['model', 'year'])['sales'].sum().reset_index()
yearly_weighted_prices = yearly_weighted_prices.merge(yearly_sales, on=['model', 'year'])

# Sort the results
yearly_weighted_prices = yearly_weighted_prices.sort_values(['model', 'year'])

# Display results
print(yearly_weighted_prices)

# Save to Excel
output_path = 'yearly_weighted_prices_by_model.xlsx'
yearly_weighted_prices.to_excel(output_path, index=False)
print(f"Results saved to {output_path}")

  yearly_weighted_prices = price_df.groupby(['model', 'year']).apply(calculate_weighted_avg).reset_index()


       model  year  weighted_Avg_Price  sales
0        212  2021               10.29   1433
1        212  2023               10.29   1037
2     AION S  2019               15.98  31929
3     AION S  2020               15.98  46091
4     AION S  2021               15.98  69220
...      ...   ...                 ...    ...
5348      魔方  2023               12.69   5871
5349       鲸  2022                0.00    463
5350       鲸  2023                0.00    170
5351     黑金刚  2015                0.00   1826
5352     黑金刚  2016                0.00    462

[5353 rows x 4 columns]
Results saved to yearly_weighted_prices_by_model.xlsx


In [91]:
################ Process Sales Data

# Translate variable names
# Manual translation
translation_map = {
    '省份': 'province',
    '品牌': 'brand',
    '车型': 'model',
    '燃料类型': 'fuel_type',
    '功率': 'power',
    '销量'  : 'sales',
    '总质量': 'mass',
}

# Apply translations
combined_df = final_df.rename(columns=translation_map)

# Standardize column names to lowercase with underscores
combined_df = standardize_columns(combined_df)

# Move 'year' and 'type' to the front
cols = ['year', 'type'] + [col for col in combined_df.columns if col not in ['year', 'type']]
combined_df = combined_df[cols]


# Converts data types
combined_df = combined_df.convert_dtypes()

# View variable names
print(combined_df.columns)


Index(['year', 'type', 'province', 'brand', 'model', 'fuel_type', 'mass',
       'power', 'sales'],
      dtype='object')


In [98]:
################ Combine Sales and Price Data

# Drop sales and month from price_df 
yearly_weighted_prices = yearly_weighted_prices.drop(columns=['sales'], errors='ignore')  # Ignore if 'sales' column doesn't exist

# Merge the datasets on model and year
merged_df = pd.merge(combined_df, yearly_weighted_prices, 
                    on=['model', 'year'], 
                    how='inner')  # Inner join to keep only matching models

# Verify if any models were dropped
original_models = set(combined_df['model'].unique())
merged_models = set(merged_df['model'].unique())
dropped_models = original_models - merged_models

if len(dropped_models) > 0:
    print(f"The following models were dropped due to missing price data: {dropped_models}")
else:
    print("All models had matching price data and were kept.")

# Save the combined data (optional)
merged_df.to_csv('combined_sales_price_data.csv', index=False)

# Display the first few rows of the combined data
print("\nCombined data preview:")
print(merged_df.head())

The following models were dropped due to missing price data: {'天马L4600', '经典帝豪', '欧朗', '风神AX7', 'Modern in', '赛力斯5 增程型', '伽途im8', '传祺AION Y', '大乘G60', '御风S16', '大通G90', '长安CS75PLUS', '小鹏IDENTY X', 'UNI-V', '力帆乐途S', '智骏GX5', '长安之星9 EV', '江淮A5', '吉利LX', '康迪K12', '领克02 HB', '北汽EC5', 'HiPhi Z', '比亚迪L3', '上汽大通D90', '江南U2', '江铃E100', '瑞风', '精灵#3', '秦EV', 'Q6', '氢舟eH2', '华泰XEV260', '几何T', 'ID.7 VIZZION', '科赛5', '威旺S50', '幻速S3L', '名爵7', '上汽大通EG10', '松散SS DOLPHIN', '长安V3', '野马T60', 'UR-V', '智骏GC1', '吉利SX7', '北京20', 'ARCFOX αT', '景逸X6', '俊风E11K', '领克08', '微蓝 6', '国金骏行', '王牌E.M7', '启辰R50', '江淮iEV5', 'EQC', '名爵Cyberster', 'Lumin', '永源A380', '帕萨特PHEV', '帝豪EV', '风行S50', 'E60 EV', '思铭M-NV', '江淮iEV7S', '小海狮X30L', '特斯拉Model 3', '华梓1号', 'E9 PHEV', 'XR-V', '狮跑', '奔驰EQE级', '长安CS35 mini', '传祺AION V', '荣威i6 MAX EV', '长安LUMIN', '远景SUV', '高尔夫嘉旅', '恒润HRS1', '悦翔V3', '唐', '致享', 'CS35PLUS', '秦PLUS EV', '坦克500 PHEV', 'X7 PHEV', '元Pro', '力帆乐途', '嘉悦X8', '小康K05S', '天越', '上汽大通G10', '英朗GT', 'MG3', '大乘E20', '幸福e+', '508

In [105]:
print(merged_df.head(10))

   year      type province brand   model  fuel_type  mass power  sales  \
0  2019  国产新能源乘用车      安徽省    奥迪   奥迪A6L  插电式汽油混合动力  2477   140      2   
1  2019  国产新能源乘用车      安徽省    奥迪   奥迪A6L  插电式汽油混合动力  2477   140      1   
2  2019  国产新能源乘用车      安徽省    奥迪   奥迪Q2L        纯电动  2090   100     21   
3  2019  国产新能源乘用车      安徽省    宝骏  宝骏E100        纯电动   990    29    375   
4  2019  国产新能源乘用车      安徽省    宝骏  宝骏E100        纯电动   999    29      5   
5  2019  国产新能源乘用车      安徽省    宝马    宝马5系  插电式汽油混合动力  2495    70    249   
6  2019  国产新能源乘用车      安徽省    宝马    宝马X1  插电式汽油混合动力  2340   100      2   
7  2019  国产新能源乘用车      安徽省    宝马    宝马X1  插电式汽油混合动力  2340    70     22   
8  2019  国产新能源乘用车      安徽省    奔腾   奔腾B30        纯电动  1838    90     37   
9  2019  国产新能源乘用车      安徽省    奔腾   奔腾X40        纯电动  1955   140      1   

   weighted_Avg_Price  
0              54.235  
1              54.235  
2              24.580  
3               5.230  
4               5.230  
5              49.950  
6              31

In [None]:
################ Process Population Data

In [None]:
################ Final Adjustments to Merged Data

# Translation of all observations

# Dealing with missing values, outliers and zeros
# Find number of zeros sales



# Calculating market share using population as total market size


0