In [85]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from math import radians, sin, cos, acos
import re
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from time import time
import datetime
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score, roc_curve as ROC, recall_score
import random
from sklearn.metrics import confusion_matrix as CM
from sklearn.metrics import accuracy_score as AS

In [2]:
data = pd.read_csv('weatherAUS.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   Evaporation    82670 non-null   float64
 6   Sunshine       75625 non-null   float64
 7   WindGustDir    135134 non-null  object 
 8   WindGustSpeed  135197 non-null  float64
 9   WindDir9am     134894 non-null  object 
 10  WindDir3pm     141232 non-null  object 
 11  WindSpeed9am   143693 non-null  float64
 12  WindSpeed3pm   142398 non-null  float64
 13  Humidity9am    142806 non-null  float64
 14  Humidity3pm    140953 non-null  float64
 15  Pressure9am    130395 non-null  float64
 16  Pressure3pm    130432 non-null  float64
 17  Cloud9am       89572 non-null

In [4]:
data.shape

(145460, 23)

In [5]:
data.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [6]:
(data.iloc[:,-1])

0          No
1          No
2          No
3          No
4          No
         ... 
145455     No
145456     No
145457     No
145458     No
145459    NaN
Name: RainTomorrow, Length: 145460, dtype: object

In [7]:
(data.iloc[:,-1].isna()).sum()

3267

In [8]:
data.dropna(subset=['RainTomorrow'],inplace=True)

In [9]:
data.reset_index(drop=True,inplace=True)

In [10]:
X = data.iloc[:,:-2]

In [11]:
y = data.iloc[:,-1]

In [12]:
X.isnull().mean(axis=0)

Date             0.000000
Location         0.000000
MinTemp          0.004480
MaxTemp          0.002265
Rainfall         0.009888
Evaporation      0.427890
Sunshine         0.476929
WindGustDir      0.065615
WindGustSpeed    0.065193
WindDir9am       0.070418
WindDir3pm       0.026570
WindSpeed9am     0.009480
WindSpeed3pm     0.018496
Humidity9am      0.012476
Humidity3pm      0.025388
Pressure9am      0.098556
Pressure3pm      0.098324
Cloud9am         0.377353
Cloud3pm         0.401525
Temp9am          0.006358
Temp3pm          0.019171
dtype: float64

In [13]:
np.unique(y)

array(['No', 'Yes'], dtype=object)

In [14]:
X_train, X_test, Y_train, Y_test = train_test_split(X,y,test_size=0.3,random_state=420)

In [15]:
for i in [X_train, X_test, Y_train, Y_test]:
    i.reset_index(drop=True,inplace=True)

In [16]:
X_train

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm
0,2015-05-08,Uluru,8.9,23.2,0.0,,,E,46.0,E,...,26.0,19.0,52.0,27.0,1025.0,1020.7,,,14.4,21.8
1,2016-10-16,Walpole,8.8,15.0,4.4,,,WNW,35.0,W,...,15.0,19.0,66.0,71.0,1024.5,1023.4,,,13.5,13.8
2,2013-12-06,Cobar,8.1,23.6,0.0,10.0,,WSW,48.0,S,...,17.0,19.0,35.0,15.0,1017.4,1016.3,0.0,0.0,14.4,21.9
3,2011-01-24,SalmonGums,15.7,26.7,3.0,,,S,52.0,ENE,...,19.0,33.0,77.0,63.0,,,,,19.4,22.5
4,2013-03-28,Mildura,15.3,23.9,10.4,9.6,6.1,SW,41.0,W,...,6.0,22.0,93.0,40.0,1019.5,1019.5,8.0,7.0,16.2,21.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99530,2008-08-27,Melbourne,7.4,15.3,0.0,2.6,5.8,SSW,30.0,WSW,...,9.0,19.0,67.0,60.0,1030.3,1029.2,6.0,6.0,10.0,13.5
99531,2013-04-09,CoffsHarbour,14.7,23.5,1.6,3.8,9.4,ENE,35.0,SW,...,17.0,9.0,71.0,65.0,1025.2,1022.3,6.0,7.0,20.8,22.5
99532,2012-05-07,Albury,4.0,14.5,0.0,,,ESE,20.0,WSW,...,4.0,11.0,99.0,68.0,1021.1,1017.5,8.0,5.0,8.0,13.8
99533,2016-02-11,Sale,13.9,24.7,0.0,,,ESE,33.0,W,...,11.0,19.0,73.0,56.0,1017.8,1015.5,3.0,2.0,18.8,22.6


In [17]:
Y_test.value_counts()

No     33078
Yes     9580
Name: RainTomorrow, dtype: int64

In [18]:
Y_train.value_counts()

No     77238
Yes    22297
Name: RainTomorrow, dtype: int64

In [19]:
#编码标签并映射返回
encorder = LabelEncoder().fit(Y_train)

Y_train = pd.DataFrame(encorder.fit_transform(Y_train))
Y_test = pd.DataFrame(encorder.fit_transform(Y_test))

In [20]:
Y_train

Unnamed: 0,0
0,0
1,1
2,0
3,1
4,0
...,...
99530,0
99531,1
99532,0
99533,0


In [21]:
#写入文件，在此之前只将y的缺失值所在行删除，其余未做处理
write = False

if write == True:
    X_train.to_csv('X_train.csv')
    X_test.to_csv('X_test.csv')
    Y_train.to_csv('Y_train.csv')
    Y_test.to_csv('Y_test.csv')

In [22]:
X_train.describe([0.01,0.05,0.1,0.25,0.5,0.75,0.9,0.99]).T

Unnamed: 0,count,mean,std,min,1%,5%,10%,25%,50%,75%,90%,99%,max
MinTemp,99092.0,12.179886,6.404463,-8.5,-1.809,1.8,3.9,7.6,12.0,16.8,20.8,25.8,31.9
MaxTemp,99292.0,23.212779,7.110535,-4.1,9.1,12.8,14.5,17.9,22.6,28.2,32.9,40.1,48.1
Rainfall,98535.0,2.348739,8.438048,0.0,0.0,0.0,0.0,0.0,0.0,0.6,6.0,37.4,367.6
Evaporation,56898.0,5.455923,4.164123,0.0,0.4,1.0,1.4,2.6,4.8,7.4,10.2,18.206,86.2
Sunshine,51968.0,7.609452,3.782961,0.0,0.0,0.3,1.5,4.8,8.4,10.6,12.0,13.4,14.3
WindGustSpeed,93045.0,39.988285,13.624007,6.0,15.0,20.0,24.0,31.0,39.0,48.0,57.0,81.0,135.0
WindSpeed9am,98591.0,14.006319,8.922327,0.0,0.0,0.0,4.0,7.0,13.0,19.0,26.0,39.0,130.0
WindSpeed3pm,97698.0,18.633841,8.814635,0.0,2.0,6.0,9.0,13.0,19.0,24.0,30.0,43.0,83.0
Humidity9am,98284.0,68.845265,19.07418,0.0,17.0,34.0,44.0,57.0,70.0,83.0,94.0,100.0,100.0
Humidity3pm,96953.0,51.497798,20.776461,0.0,9.0,17.0,23.0,37.0,52.0,66.0,79.0,98.0,100.0


In [23]:
X_test.describe([0.01,0.05,0.1,0.25,0.5,0.75,0.9,0.99]).T

Unnamed: 0,count,mean,std,min,1%,5%,10%,25%,50%,75%,90%,99%,max
MinTemp,42464.0,12.201599,6.400577,-7.6,-1.8,1.8,4.0,7.6,12.0,16.9,20.8,25.8,33.9
MaxTemp,42579.0,23.259442,7.134084,-4.8,9.0,12.8,14.5,18.0,22.7,28.2,33.0,40.3,47.0
Rainfall,42252.0,2.352854,8.528195,0.0,0.0,0.0,0.0,0.0,0.0,0.8,6.0,37.498,371.0
Evaporation,24452.0,5.502172,4.24471,0.0,0.4,1.0,1.4,2.6,4.8,7.4,10.2,18.6,145.0
Sunshine,22409.0,7.660569,3.778036,0.0,0.0,0.4,1.6,4.9,8.5,10.7,12.1,13.4,14.5
WindGustSpeed,39878.0,39.974974,13.506464,7.0,15.0,20.0,24.0,31.0,39.0,48.0,57.0,80.0,135.0
WindSpeed9am,42254.0,13.991882,8.825421,0.0,0.0,0.0,4.0,7.0,13.0,19.0,26.0,39.0,87.0
WindSpeed3pm,41865.0,18.646292,8.777042,0.0,2.0,6.0,9.0,13.0,19.0,24.0,30.0,43.0,87.0
Humidity9am,42135.0,68.840418,18.998023,3.0,18.0,34.0,44.0,57.0,70.0,83.0,94.0,100.0,100.0
Humidity3pm,41630.0,51.447226,20.847527,1.0,9.0,16.0,23.0,36.0,52.0,66.0,79.0,97.0,100.0


In [24]:
(X_test['Cloud9am'] > 8.0).sum()

1

In [25]:
(X_test['Cloud3pm'] > 8.0).sum()

1

In [26]:
(X_train['Cloud3pm'] > 8.0).sum()

0

In [27]:
X_train.shape

(99535, 21)

In [28]:
x1 = X_train[X_train['Cloud9am'] > 8.0].index
x2 = X_train[X_train['Cloud3pm'] > 8.0].index
X_train.drop(x1,inplace=True)
Y_train.drop(x1,inplace=True)
X_train.drop(x2,inplace=True)
Y_train.drop(x2,inplace=True)
x3 = X_test[X_test['Cloud9am'] > 8.0].index
x4 = X_test[X_test['Cloud3pm'] > 8.0].index
X_test.drop(x3,inplace=True)
Y_test.drop(x3,inplace=True)
X_test.drop(x4,inplace=True)
Y_test.drop(x4,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.drop(x1,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.drop(x2,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test.drop(x3,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test.drop(x4,inplace=True)


In [29]:
X_test.shape

(42656, 21)

In [30]:
X_train.loc[X_train['Rainfall'] >= 1.0,'RainToday'] = 1
X_train.loc[X_train['Rainfall'] < 1.0,'RainToday'] = 0
X_train.loc[X_train['Rainfall'] == np.nan,'RainToday'] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.loc[X_train['Rainfall'] >= 1.0,'RainToday'] = 1


In [31]:
X_test.loc[X_test['Rainfall'] >= 1.0,'RainToday'] = 1
X_test.loc[X_test['Rainfall'] < 1.0,'RainToday'] = 0
X_test.loc[X_test['Rainfall'] == np.nan,'RainToday'] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test.loc[X_test['Rainfall'] >= 1.0,'RainToday'] = 1


In [32]:
X_train['Date'] = X_train['Date'].apply(lambda x : int(x.split('-')[1]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['Date'] = X_train['Date'].apply(lambda x : int(x.split('-')[1]))


In [33]:
X_test['Date'] = X_test['Date'].apply(lambda x : int(x.split('-')[1]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['Date'] = X_test['Date'].apply(lambda x : int(x.split('-')[1]))


In [34]:
X_train = X_train.rename(columns={'Date':'Month'})

In [35]:
X_test = X_test.rename(columns={'Date':'Month'})

In [36]:
X_train

Unnamed: 0,Month,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
0,5,Uluru,8.9,23.2,0.0,,,E,46.0,E,...,19.0,52.0,27.0,1025.0,1020.7,,,14.4,21.8,0.0
1,10,Walpole,8.8,15.0,4.4,,,WNW,35.0,W,...,19.0,66.0,71.0,1024.5,1023.4,,,13.5,13.8,1.0
2,12,Cobar,8.1,23.6,0.0,10.0,,WSW,48.0,S,...,19.0,35.0,15.0,1017.4,1016.3,0.0,0.0,14.4,21.9,0.0
3,1,SalmonGums,15.7,26.7,3.0,,,S,52.0,ENE,...,33.0,77.0,63.0,,,,,19.4,22.5,1.0
4,3,Mildura,15.3,23.9,10.4,9.6,6.1,SW,41.0,W,...,22.0,93.0,40.0,1019.5,1019.5,8.0,7.0,16.2,21.2,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99530,8,Melbourne,7.4,15.3,0.0,2.6,5.8,SSW,30.0,WSW,...,19.0,67.0,60.0,1030.3,1029.2,6.0,6.0,10.0,13.5,0.0
99531,4,CoffsHarbour,14.7,23.5,1.6,3.8,9.4,ENE,35.0,SW,...,9.0,71.0,65.0,1025.2,1022.3,6.0,7.0,20.8,22.5,1.0
99532,5,Albury,4.0,14.5,0.0,,,ESE,20.0,WSW,...,11.0,99.0,68.0,1021.1,1017.5,8.0,5.0,8.0,13.8,0.0
99533,2,Sale,13.9,24.7,0.0,,,ESE,33.0,W,...,19.0,73.0,56.0,1017.8,1015.5,3.0,2.0,18.8,22.6,0.0


In [37]:
city_ll = pd.read_csv('cityll.csv',index_col=0)
city_climate = pd.read_csv('Cityclimate.csv')

In [38]:
city_ll['Latitude'] = city_ll['Latitude'].apply(lambda x : float(x[:-1]))
city_ll['Longitude'] = city_ll['Longitude'].apply(lambda x : float(x[:-1]))

In [39]:
city_ll = city_ll.iloc[:,0:3]

In [40]:
city_ll['climate'] = city_climate.iloc[:,-1]

In [41]:
city_ll['climate'].value_counts()

Hot dry summer, cool winter          24
Warm temperate                       18
Hot dry summer, warm winter          18
High humidity summer, warm winter    17
Mild temperate                        9
Cool temperate                        9
Warm humid summer, mild winter        5
Name: climate, dtype: int64

In [42]:
city_ll

Unnamed: 0,City,Latitude,Longitude,climate
0,Adelaide,34.9285,138.6007,Warm temperate
1,Albany,35.0275,117.8840,Mild temperate
2,Albury,36.0737,146.9135,"Hot dry summer, cool winter"
3,Wodonga,36.1241,146.8818,"Hot dry summer, cool winter"
4,AliceSprings,23.6980,133.8807,"Hot dry summer, warm winter"
...,...,...,...,...
95,Wollongong,34.4278,150.8931,Warm temperate
96,Wyndham,15.4825,128.1228,"High humidity summer, warm winter"
97,Yalgoo,28.3445,116.6851,"Hot dry summer, cool winter"
98,Yulara,25.2335,130.9849,"Hot dry summer, warm winter"


In [43]:
samplecity = pd.read_csv('samplecity.csv',index_col=0)

In [44]:
samplecity.head()

Unnamed: 0,City,Latitude,Longitude,Latitudedir,Longitudedir
0,Canberra,35.2809°,149.1300°,"S,",E
1,Sydney,33.8688°,151.2093°,"S,",E
2,Perth,31.9505°,115.8605°,"S,",E
3,Darwin,12.4634°,130.8456°,"S,",E
4,Hobart,42.8821°,147.3272°,"S,",E


In [45]:
samplecity['Latitude'] = samplecity['Latitude'].apply(lambda x : float(x[:-1]))
samplecity['Longitude'] = samplecity['Longitude'].apply(lambda x : float(x[:-1]))

In [46]:
city_ll['lat_l'] = city_ll.iloc[:,1].apply(lambda x : radians(x))
city_ll['log_l'] = city_ll.iloc[:,2].apply(lambda x : radians(x))
samplecity['lat_n'] = samplecity.iloc[:,1].apply(lambda x : radians(x))
samplecity['log_n'] = samplecity.iloc[:,2].apply(lambda x : radians(x))

In [47]:
for i in range(samplecity.shape[0]):
    lat_l = city_ll.loc[:,'lat_l']
    log_l = city_ll.loc[:,'log_l']
    lat_n = samplecity.loc[i,'lat_n']
    log_n = samplecity.loc[i,'log_n']
    dist = 6371.01 * np.arccos(np.sin(lat_l)*np.sin(lat_n) + np.cos(lat_l)*np.cos(lat_n)*np.cos(log_l - log_n))
    #np.argsort：将array从小到大排列，输出其索引
    city_index = np.argsort(dist)[0]
    samplecity.loc[i,'near_city'] = city_ll.loc[city_index,'City']
    samplecity.loc[i,'Climate'] = city_ll.loc[city_index,'climate']

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [48]:
local_rst = samplecity.iloc[:,[0,-1]]

In [49]:
local_rst = local_rst.rename(columns={'City':'Location'})

In [50]:
local_rst.set_index(keys='Location',inplace=True)

In [51]:
# 将气候映射到城市
X_train['Location'] = X_train['Location'].map(local_rst['Climate']).apply(lambda x: re.sub(',','',x.strip()))
X_test['Location'] = X_test['Location'].map(local_rst['Climate']).apply(lambda x: re.sub(',','',x.strip()))

In [52]:
X_train.rename(columns={'Location':'Climate'},inplace=True)
X_test.rename(columns={'Location':'Climate'},inplace=True)

In [53]:
cate = X_train.columns[X_train.dtypes == 'object'].tolist()

In [54]:
cate += ['RainToday','Cloud9am','Cloud3pm']

In [55]:
cate

['Climate',
 'WindGustDir',
 'WindDir9am',
 'WindDir3pm',
 'RainToday',
 'Cloud9am',
 'Cloud3pm']

In [56]:
si = SimpleImputer(missing_values=np.nan,strategy='most_frequent')
si = si.fit(X_train.loc[:,cate])

In [57]:
X_train.loc[:,cate] = si.transform(X_train.loc[:,cate])
X_test.loc[:,cate] = si.transform(X_test.loc[:,cate])

In [58]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99534 entries, 0 to 99534
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Month          99534 non-null  int64  
 1   Climate        99534 non-null  object 
 2   MinTemp        99091 non-null  float64
 3   MaxTemp        99291 non-null  float64
 4   Rainfall       98534 non-null  float64
 5   Evaporation    56898 non-null  float64
 6   Sunshine       51967 non-null  float64
 7   WindGustDir    99534 non-null  object 
 8   WindGustSpeed  93044 non-null  float64
 9   WindDir9am     99534 non-null  object 
 10  WindDir3pm     99534 non-null  object 
 11  WindSpeed9am   98590 non-null  float64
 12  WindSpeed3pm   97697 non-null  float64
 13  Humidity9am    98283 non-null  float64
 14  Humidity3pm    96952 non-null  float64
 15  Pressure9am    89709 non-null  float64
 16  Pressure3pm    89731 non-null  float64
 17  Cloud9am       99534 non-null  float64
 18  Cloud3

In [59]:
oe = OrdinalEncoder()

In [60]:
oe = oe.fit(X_test.loc[:,cate])

In [61]:
X_train.loc[:,cate] = oe.transform(X_train.loc[:,cate])
X_test.loc[:,cate] = oe.transform(X_test.loc[:,cate])

In [62]:
col = X_train.columns.to_list()

In [63]:
for i in cate:
    col.remove(i)

In [64]:
col

['Month',
 'MinTemp',
 'MaxTemp',
 'Rainfall',
 'Evaporation',
 'Sunshine',
 'WindGustSpeed',
 'WindSpeed9am',
 'WindSpeed3pm',
 'Humidity9am',
 'Humidity3pm',
 'Pressure9am',
 'Pressure3pm',
 'Temp9am',
 'Temp3pm']

In [65]:
impmean = SimpleImputer(missing_values=np.nan,strategy='mean').fit(X_train.loc[:,col])

In [66]:
X_train.loc[:,col] = impmean.transform(X_train.loc[:,col])
X_test.loc[:,col] = impmean.transform(X_test.loc[:,col])

In [67]:
col.remove('Month')

In [68]:
ss = StandardScaler().fit(X_train.loc[:,col])

In [69]:
X_train.loc[:,col] = ss.transform(X_train.loc[:,col])
X_test.loc[:,col] = ss.transform(X_test.loc[:,col])

In [70]:
X_train.head()

Unnamed: 0,Month,Climate,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
0,5.0,3.0,-0.513295,-0.001812,-0.279763,-2.821095e-16,-3.249304e-16,0.0,0.456382,0.0,...,0.041918,-0.888743,-1.194718,1.089578,0.8156536,7.0,7.0,-0.398308,0.018149,0.0
1,10.0,4.0,-0.528945,-1.156449,0.244324,-2.821095e-16,-3.249304e-16,14.0,-0.378703,13.0,...,0.041918,-0.1501,0.951081,1.015465,1.220093,7.0,7.0,-0.537379,-1.147633,1.0
2,12.0,2.0,-0.638489,0.054512,-0.279763,1.443322,-3.249304e-16,15.0,0.608216,8.0,...,0.041918,-1.785666,-1.779936,-0.03694371,0.1565679,0.0,0.0,-0.398308,0.032721,0.0
3,1.0,2.0,0.550852,0.491021,0.077569,-2.821095e-16,-3.249304e-16,8.0,0.911883,1.0,...,1.645058,0.430261,0.560936,-1.685141e-14,-1.70294e-14,7.0,7.0,0.374312,0.120154,1.0
4,3.0,2.0,0.488256,0.096755,0.958989,1.316271,-0.5522152,12.0,0.076798,13.0,...,0.385448,1.274424,-0.560732,0.2743321,0.635903,8.0,7.0,-0.120165,-0.069285,1.0


In [71]:
Y_train = Y_train.iloc[:,0].ravel()
Y_test = Y_test.iloc[:,0].ravel()

In [75]:
#下面的代码运行时间过长，因此选取前5000行
X_train_part = X_train.iloc[0:5000,:]
Y_train_part = Y_train[0:5000]
X_test_part = X_test.iloc[0:5000,:]
Y_test_part = Y_test[0:5000]

'''
times = time()

for kernel in ['linear','poly','rbf','sigmoid']:
    clf = SVC(kernel=kernel
             ,gamma='auto'
             ,degree=1
             ,cache_size=20000
             ).fit(X_train,Y_train)
    result = clf.predict(X_test)
    score = clf.score(X_test,Y_test)
    recall = recall_score(Y_test,result)
    auc = roc_auc_score(Y_test,clf.decision_function(X_test))
    print('%s 的分数为 %f,此时的recall为 %f,此时的auc面积为 %f' % (kernel,score,recall,auc))
    print('用时：%s' % datetime.datetime.fromtimestamp(time()-times).strftime('%M:%S:%f'))
'''

"\ntimes = time()\n\nfor kernel in ['linear','poly','rbf','sigmoid']:\n    clf = SVC(kernel=kernel\n             ,gamma='auto'\n             ,degree=1\n             ,cache_size=20000\n             ).fit(X_train,Y_train)\n    result = clf.predict(X_test)\n    score = clf.score(X_test,Y_test)\n    recall = recall_score(Y_test,result)\n    auc = roc_auc_score(Y_test,clf.decision_function(X_test))\n    print('%s 的分数为 %f,此时的recall为 %f,此时的auc面积为 %f' % (kernel,score,recall,auc))\n    print('用时：%s' % datetime.datetime.fromtimestamp(time()-times).strftime('%M:%S:%f'))\n"

In [74]:
times = time()

for kernel in ['linear','poly','rbf','sigmoid']:
    clf = SVC(kernel=kernel
             ,gamma='auto'
             ,degree=1
             ,cache_size=20000
             ).fit(X_train_part,Y_train_part)
    result = clf.predict(X_test_part)
    score = clf.score(X_test_part,Y_test_part)
    recall = recall_score(Y_test_part,result)
    auc = roc_auc_score(Y_test_part,clf.decision_function(X_test_part))
    print('%s 的分数为 %f,此时的recall为 %f,此时的auc面积为 %f' % (kernel,score,recall,auc))
    print('用时：%s' % datetime.datetime.fromtimestamp(time()-times).strftime('%M:%S:%f'))

linear 的分数为 0.839600,此时的recall为 0.448490,此时的auc面积为 0.868583
用时：00:09:206167
poly 的分数为 0.839800,此时的recall为 0.439609,此时的auc面积为 0.867783
用时：00:10:998851
rbf 的分数为 0.823400,此时的recall为 0.312611,此时的auc面积为 0.818015
用时：00:18:260526
sigmoid 的分数为 0.647600,此时的recall为 0.159858,此时的auc面积为 0.443368
用时：00:20:431452


In [79]:
clf = SVC(kernel='linear'
         ,gamma='auto'
         ,cache_size=20000
         ).fit(X_train_part,Y_train_part)
result = clf.predict(X_test_part)
cm = CM(Y_test_part,result,labels=(1,0))

In [80]:
cm

array([[ 505,  621],
       [ 181, 3693]], dtype=int64)

In [81]:
specificity = cm[1,1]/cm[1,:].sum()

In [82]:
specificity

0.9532782653588022

In [86]:
clf = SVC(kernel='linear'
         ,cache_size=20000
         ).fit(X_train_part,Y_train_part)

In [101]:
result = clf.predict(X_test_part)
score = clf.score(X_test_part,Y_test_part)

prob = pd.DataFrame(clf.decision_function(X_test_part))

prob.loc[prob.iloc[:,0] >= thresholds[maxindex],'y_pred'] = 1
prob.loc[prob.iloc[:,0] < thresholds[maxindex],'y_pred'] = 0

In [88]:
FPR, recall, thresholds = ROC(Y_test_part,clf.decision_function(X_test_part),pos_label=1)

In [98]:
maxindex = (recall - FPR).tolist().index((recall - FPR).max())

In [100]:
thresholds[maxindex]

-0.9643592943330122

In [112]:
score = AS(Y_test_part,prob.loc[:,'y_pred'])
recall = recall_score(Y_test_part,prob.loc[:,'y_pred'])

In [113]:
recall

0.8001776198934281

支持向量机（SVM）经历了7天左右的时间正式结束啦！机器学习最普遍最有用的算法告一段落了，希望明天再接再厉！转行成功！