In [67]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [68]:
import pandas as pd
import numpy as np

file_path = '/content/drive/My Drive/DataScience/Data/cleaned_data_weather.csv'
df = pd.read_csv(file_path)

In [69]:
df_original_data = df

In [70]:
df['next_1_day_mean_temp'] = df_original_data['mean_temp'].shift(-1)
df['next_2_day_mean_temp'] = df_original_data['mean_temp'].shift(-2)
df['next_3_day_mean_temp'] = df_original_data['mean_temp'].shift(-3)

In [71]:
df.dropna(inplace=True)

In [72]:
del df['date']

#### cloud_cover processing

In [73]:
df['cloud_cover'].value_counts()

7.0    3116
6.0    2952
5.0    2292
8.0    1903
4.0    1815
3.0    1148
2.0     860
1.0     603
0.0     375
5.8       2
6.8       2
5.4       2
7.4       2
9.0       2
5.6       1
6.2       1
6.6       1
7.2       1
Name: cloud_cover, dtype: int64

In [74]:
condition = (df['cloud_cover'] % 1 != 0) | (df['cloud_cover'] > 8.0)

# 从DataFrame中去除满足条件的行
df = df[~condition]

In [75]:
df['cloud_cover'].value_counts()

7.0    3116
6.0    2952
5.0    2292
8.0    1903
4.0    1815
3.0    1148
2.0     860
1.0     603
0.0     375
Name: cloud_cover, dtype: int64

#### change the sunshine into hours

In [76]:
df['sunshine'] = df['sunshine'] * 60

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sunshine'] = df['sunshine'] * 60


#### change snow depth into a category value

In [77]:
df['snow_depth'].value_counts()

0.0     14933
1.0        45
2.0        23
4.0        15
3.0        15
5.0         6
6.0         4
7.0         4
8.0         4
10.0        2
0.4         2
12.0        2
11.0        2
9.0         1
18.0        1
13.0        1
15.0        1
16.0        1
22.0        1
0.8         1
Name: snow_depth, dtype: int64

In [78]:
df.loc[df['snow_depth'] > 0, 'snow_depth'] = 1.0

In [79]:
df['snow_depth'].value_counts()

0.0    14933
1.0      131
Name: snow_depth, dtype: int64

#### change the precipitation into category value¶

In [80]:
df['precipitation'].value_counts()

0.00     7907
0.20     1005
0.40      452
0.10      313
0.60      297
         ... 
53.10       1
18.70       1
25.40       1
20.00       1
0.46        1
Name: precipitation, Length: 261, dtype: int64

In [81]:
bins = [-float('inf'), 0.1, 2.5, 10, 50, float('inf')]
labels = [0, 1, 2, 3, 4]

df['precipitation'] = pd.cut(df['precipitation'], bins=bins, labels=labels, right=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['precipitation'] = pd.cut(df['precipitation'], bins=bins, labels=labels, right=False)


In [82]:
df['precipitation'].value_counts()

0    7907
1    4251
2    2335
3     567
4       4
Name: precipitation, dtype: int64

In [83]:
df['cloud_cover'] = df['cloud_cover'].astype('category')
df['snow_depth'] = df['snow_depth'].astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cloud_cover'] = df['cloud_cover'].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['snow_depth'] = df['snow_depth'].astype('category')


In [84]:
df.dtypes

cloud_cover             category
sunshine                 float64
global_radiation         float64
max_temp                 float64
mean_temp                float64
min_temp                 float64
precipitation           category
pressure                 float64
snow_depth              category
next_1_day_mean_temp     float64
next_2_day_mean_temp     float64
next_3_day_mean_temp     float64
dtype: object

In [85]:
from sklearn.preprocessing import MinMaxScaler

features_to_normalize = ['sunshine', 'global_radiation', 'max_temp', 'mean_temp', 'min_temp', 'pressure']

scaler = MinMaxScaler()

df.loc[:, features_to_normalize] = scaler.fit_transform(df.loc[:, features_to_normalize])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, features_to_normalize] = scaler.fit_transform(df.loc[:, features_to_normalize])


In [86]:
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr


X = df.drop(['next_1_day_mean_temp','next_2_day_mean_temp','next_3_day_mean_temp'], axis=1)
y = df[['next_1_day_mean_temp', 'next_2_day_mean_temp','next_3_day_mean_temp']]



# 划分训练集为新的训练集和验证集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 划分训练集为新的训练集和验证集
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)




In [87]:
# 打印划分后的数据集大小
print("train dataset:", X_train.shape)
print("validation dataset:", X_val.shape)
print("test dataset:", X_test.shape)

train dataset: (9640, 9)
validation dataset: (2411, 9)
test dataset: (3013, 9)


In [94]:
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error


svr_model = MultiOutputRegressor(SVR(kernel = 'linear', C = 63.63768013, gamma = 6.70900599, epsilon = 0.63022038, verbose = 3))
svr_model.fit(X_train, y_train)

y_pred = svr_model.predict(X_val)

mse = mean_squared_error(y_val, y_pred)

print('mse:', mse)



[LibSVM][LibSVM][LibSVM]mse: 3.8013648049948343


In [None]:
!pip install geneticalgorithm



In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from geneticalgorithm import geneticalgorithm as ga



varbound = np.array([          [0, 3],              # kernel
                     [0.01, 100],            # C
                     [0.001, 10],            # gamma
                     [0.01, 10]             # epsilon
                     ])


# 创建遗传算法对象
algorithm_param = {'max_num_iteration': 5, 'population_size': 5, 'elit_ratio': 0.01,
                   'parents_portion': 0.3, 'crossover_probability': 0.5, 'mutation_probability': 0.1,
                   'crossover_type': 'uniform', 'max_iteration_without_improv': 15}
model = ga(function=SVR_fitness_function, dimension=4, variable_type='real',
           variable_type_mixed  = np.array(['int', 'real', 'real', 'real']) ,variable_boundaries=varbound,
           algorithm_parameters=algorithm_param, function_timeout = 500)

# 运行遗传算法进行优化
model.run()

# 获得优化的超参数组合
best_params = model.output_dict['variable']

[LibSVM][LibSVM][LibSVM][LibSVM]

In [None]:

# Root Mean Squared Error
rmse = np.sqrt(mse)

# Mean Absolute Error
mae = mean_absolute_error(y_test, y_pred)

# Median Absolute Error
mdae = median_absolute_error(y_test, y_pred)

# Mean Absolute Percentage Error
diff = np.abs((y_test - y_pred) / y_test)
diff.replace([np.inf, -np.inf], np.nan, inplace=True)
diff.dropna(inplace=True)
mape = np.mean(np.mean(diff)*100)

# Root Mean Square Percentage Error
n = y_test.shape[1]
rmspe_scores = []
for i in range(n):
    y_obs = y_test.iloc[:, i]
    y_predict = y_pred[:, i]
    diff = ((y_obs - y_predict) / y_obs) ** 2
    diff.replace([np.inf, -np.inf], np.nan, inplace=True)
    diff.dropna(inplace=True)
    rmspe = np.sqrt(np.mean(diff)) * 100
    rmspe_scores.append(rmspe)
rmspe = np.mean(rmspe_scores)

# Pearson's correlation coefficient
correlations = []
for i in range(y_test.shape[1]):
    pearson_corr, _ = pearsonr(y_test.iloc[:, i], y_pred[:,i])
    correlations.append(pearson_corr)
mean_pearson_corr = np.mean(correlations)

# Index of Agreement
n = y_test.shape[1]
ia_scores = []
for i in range(n):
    y_obs = y_test.iloc[:, i]
    y_predict = y_pred[:,i]
    y_obs_avg = np.mean(y_obs)

    numerator = np.sum((y_predict - y_obs)**2)
    denominator = np.sum((y_obs_avg - y_obs)**2)
    ia = 1 - (numerator / denominator)
    ia_scores.append(ia)
d = np.mean(ia_scores)


print('Root Mean Squared Error:', rmse)
print('Mean Absolute Error:', mae)
print('Median Absolute Error:', mdae)
print('Mean Absolute Percentage Error:',mape)
print('Root Mean Square Percentage Error:', rmspe)
print('Pearson\'s correlation coefficient:', mean_pearson_corr)
print('Index of Agreement:', d)

Mean Squared Error: 4.290025462993694
Root Mean Squared Error: 2.0712376645362776
Mean Absolute Error: 1.5648120367297267
Median Absolute Error: 1.3133333333333328
Mean Absolute Percentage Error: 28.66935238309287
Root Mean Square Percentage Error: 125.72583179498376
Pearson's correlation coefficient: 0.9318600313952342
Index of Agreement: 0.8691557780429614


  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
