In [None]:
import pandas as pd
import re

# Load the raw data files.
# NOTE(review): the '/your file path/' prefix looks like a placeholder --
# point these at the real CSV locations before running.
cases_file_path = '/your file path/Malaria cases.csv'
deaths_file_path = '/your file path/Malaria deaths.csv'

# Read both raw tables with pandas' default CSV settings.
cases_df = pd.read_csv(cases_file_path)
deaths_df = pd.read_csv(deaths_file_path)

# Data cleaning.
def clean_numeric_data(value):
    """Convert one raw table cell to an integer where possible.

    String cells may carry a bracketed annotation (for example
    ``"1 234 [1 000-2 000]"``) and may use whitespace as a thousands
    separator.  The bracketed part and every whitespace character are
    removed before the remainder is parsed with ``int``.

    Args:
        value: A single cell value of any type.

    Returns:
        For string input, the parsed ``int``, or ``None`` when the cleaned
        text is not a valid integer.  Non-string input is returned unchanged.
    """
    if not isinstance(value, str):
        return value

    # Drop every "[...]" group together with its contents.
    without_brackets = re.sub(r'\[.*?\]', '', value)
    # ``\s`` also matches Unicode whitespace, so non-breaking / narrow spaces
    # used as digit-group separators are removed along with plain spaces
    # instead of making the whole cell unparseable.
    digits = re.sub(r'\s+', '', without_brackets)

    try:
        return int(digits)
    except ValueError:
        return None

# Restructure and clean a raw table: extract region names, year labels and values.
def restructure_and_clean_data(df):
    """Reshape a raw table into ``Region`` plus one cleaned column per year.

    The layout is taken positionally from ``df``: the first column holds the
    region names, the first row (columns 1 onward) holds the year labels, and
    the remaining cells hold the values, each of which is passed through
    ``clean_numeric_data``.

    Args:
        df: Raw table as loaded from CSV.

    Returns:
        A new DataFrame whose first column is ``'Region'`` and whose other
        columns are labelled with the year values from the first row of ``df``.
    """
    region_column = df.iloc[:, 0]
    year_columns = df.iloc[0, 1:]
    raw_values = df.iloc[1:, 1:]

    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the
    # old name is deprecated; use the new name when this pandas provides it
    # and fall back to applymap on older versions.
    if hasattr(pd.DataFrame, 'map'):
        numeric_data = raw_values.map(clean_numeric_data)
    else:
        numeric_data = raw_values.applymap(clean_numeric_data)

    numeric_data.columns = year_columns
    # Row 0 of the first column belongs to the year-label row, so skip it.
    numeric_data.insert(0, 'Region', region_column.iloc[1:].values)

    return numeric_data

# Run the restructure/clean step on both raw tables.
cases_cleaned_df = restructure_and_clean_data(cases_df)
deaths_cleaned_df = restructure_and_clean_data(deaths_df)

# Destination paths for the cleaned tables.
cases_cleaned_file = '/your file path/Cleaned_Malaria_Cases.csv'
deaths_cleaned_file = '/your file path/Cleaned_Malaria_Deaths.csv'

# Save the cleaned data; index=False keeps the row index out of the CSV.
cases_cleaned_df.to_csv(cases_cleaned_file, index=False)
deaths_cleaned_df.to_csv(deaths_cleaned_file, index=False)

# Final expression of the cell: evaluates to the two output paths.
cases_cleaned_file, deaths_cleaned_file


('/Users/wongyining/Desktop/大五上/用 Python與Excel 學習統計模型思維/期末專案/Cleaned_Malaria_Cases.csv',
 '/Users/wongyining/Desktop/大五上/用 Python與Excel 學習統計模型思維/期末專案/Cleaned_Malaria_Deaths.csv')

In [None]:
from sklearn.linear_model import LinearRegression
import numpy as np

# Split a cleaned table into region names, years and values for modelling.
def prepare_regional_data(df):
    """Break a cleaned table into the arrays used for model fitting.

    Args:
        df: Table with a ``'Region'`` column; every column after the first
            is treated as a year whose label converts to ``int``.

    Returns:
        A tuple ``(regions, years, data)``: the ``'Region'`` values, the year
        labels as an integer column vector of shape ``(n_years, 1)``, and the
        remaining columns as a float array with one row per table row.
    """
    region_names = df['Region'].values

    year_labels = df.columns[1:]
    year_matrix = year_labels.astype(int).values.reshape(-1, 1)

    value_frame = df.iloc[:, 1:]
    value_matrix = value_frame.astype(float).values

    return region_names, year_matrix, value_matrix

# Build the model with scikit-learn's LinearRegression (ordinary least squares;
# per the scikit-learn documentation the solution is computed via an SVD of X).
def predict_future_values(regions, years, data, future_years):
    """Fit one linear trend per region and extrapolate it to ``future_years``.

    Each region is fitted only on the years for which it has a finite value,
    so cells that could not be parsed during cleaning (NaN) no longer abort
    the whole run.

    Args:
        regions: Sequence of region labels; ``regions[i]`` labels row ``i``
            of ``data``.
        years: 2-D feature array of observed years, one row per year
            (shape ``(n_years, 1)`` as built by ``prepare_regional_data``).
        data: 2-D array with one row per region and one column per year;
            may contain NaN for missing observations.
        future_years: 2-D array of years to predict, one row per year.

    Returns:
        A dict mapping each region label to a 1-D integer array with one
        prediction per row of ``future_years`` (floats are truncated by
        ``astype(int)``).

    Raises:
        ValueError: If a region has no finite observation to fit on.
    """
    year_matrix = np.asarray(years)
    predictions = {}

    for i, region in enumerate(regions):
        observed = np.asarray(data[i, :], dtype=float)
        # LinearRegression rejects NaN/inf, so keep only usable points.
        usable = np.isfinite(observed)
        if not usable.any():
            raise ValueError(
                f"Region {region!r} has no valid observations to fit a trend on."
            )

        model = LinearRegression()
        model.fit(year_matrix[usable], observed[usable])
        predictions[region] = model.predict(future_years).astype(int).flatten()

    return predictions

# Years to forecast (2022 through 2026) as a column vector.
future_years = np.arange(2022, 2027).reshape(-1, 1)

# Per-region forecasts for case counts.
cases_regions, cases_years, cases_data = prepare_regional_data(cases_cleaned_df)
cases_predictions = predict_future_values(cases_regions, cases_years, cases_data, future_years)

# Per-region forecasts for death counts.
deaths_regions, deaths_years, deaths_data = prepare_regional_data(deaths_cleaned_df)
deaths_predictions = predict_future_values(deaths_regions, deaths_years, deaths_data, future_years)

# Tabulate the forecasts: one row per forecast year, one column per region.
cases_predictions_df = pd.DataFrame(cases_predictions, index=future_years.flatten())
cases_predictions_df.index.name = 'Year'

deaths_predictions_df = pd.DataFrame(deaths_predictions, index=future_years.flatten())
deaths_predictions_df.index.name = 'Year'

# A downward linear trend can extrapolate below zero; floor such values at 0.
cases_predictions_df = cases_predictions_df.clip(lower=0)
deaths_predictions_df = deaths_predictions_df.clip(lower=0)

# Destination paths for the forecast tables.
cases_predictions_file = '/your file path/Predicted_Malaria_Cases.csv'
deaths_predictions_file = '/your file path/Predicted_Malaria_Deaths.csv'

# Save the forecasts; the 'Year' index is written as the first CSV column.
cases_predictions_df.to_csv(cases_predictions_file)
deaths_predictions_df.to_csv(deaths_predictions_file)

# Final expression of the cell: evaluates to the two output paths.
cases_predictions_file, deaths_predictions_file


('/Users/wongyining/Desktop/大五上/用 Python與Excel 學習統計模型思維/期末專案/Predicted_Malaria_Cases.csv',
 '/Users/wongyining/Desktop/大五上/用 Python與Excel 學習統計模型思維/期末專案/Predicted_Malaria_Deaths.csv')