# Analysis of Historical Windspeeds in Ireland

Downloading data in the command line using ``wget -O`` from [data.gov.ie](https://data.gov.ie/organization/meteireann?q=wind&sort=score+desc%2C+metadata_modified+desc)

I chose 5 dataframes to represent regions North, South, East, West and the Midlands. These being: Malin head, Roches Point, Dublin airport, Belmullet and Mullingar, respectively.

The aim is for the analysis to be as comprehensive as possible whilst retaining manageability. 

In [82]:
# import necessary libraries
import pandas as pd 

## Loading and Inspecting the Data sets

Before I begin I will need to review each data set as there are sometimes different columns and different start dates for each. I will also need to inspect for missing values and clean up the data before creating my analysis.

### Roches Point (South)

In [83]:
filepath1 = r"C:\Users\joann\Desktop\PFDA-project\dataframes\roches_point.csv"
df_south = pd.read_csv(filepath1, skiprows= 24)
df_south.head()

Unnamed: 0,date,ind,maxtp,ind.1,mintp,igmin,gmin,ind.2,rain,cbl,...,ddhm,ind.5,hg,soil,pe,evap,smd_wd,smd_md,smd_pd,glorad
0,01-dec-1955,0,10.9,0,5.9,,,0,9.1,1004.2,...,170,0,24,,,,,,,
1,02-dec-1955,0,12.1,0,4.0,,,0,0.9,1011.2,...,210,0,32,,,,,,,
2,03-dec-1955,0,10.7,0,6.4,,,3,0.0,1018.2,...,270,0,21,,,,,,,
3,04-dec-1955,0,11.6,0,4.8,,,0,1.5,1026.6,...,210,0,23,,,,,,,
4,05-dec-1955,0,11.8,0,10.7,,,0,0.8,1023.2,...,220,0,32,,,,,,,


In [84]:
df_south.tail()

Unnamed: 0,date,ind,maxtp,ind.1,mintp,igmin,gmin,ind.2,rain,cbl,...,ddhm,ind.5,hg,soil,pe,evap,smd_wd,smd_md,smd_pd,glorad
21431,26-nov-2024,0,7.5,0,2.3,0,-1.0,0,0.4,1006.9,...,40,0,15,4.643,0.3,0.3,0.4,0.4,-8.6,332
21432,27-nov-2024,0,10.0,0,3.7,0,0.4,0,0.0,1011.5,...,120,0,22,4.903,0.5,0.6,1.0,1.0,-7.6,430
21433,28-nov-2024,0,12.6,0,9.6,0,1.4,0,4.2,1013.8,...,120,0,37,8.309,1.2,1.6,0.0,-2.1,-10.0,51
21434,29-nov-2024,0,13.0,0,12.3,0,8.2,0,10.0,1008.6,...,140,0,37,11.043,0.4,0.7,0.0,-9.6,-10.0,93
21435,30-nov-2024,0,13.2,0,12.1,0,11.7,0,7.4,1006.9,...,160,0,38,11.818,0.3,0.5,0.0,-7.1,-10.0,114


In [85]:
df_south.describe()

Unnamed: 0,ind,ind.1,ind.2,ind.3,ind.4,ind.5
count,21436.0,21436.0,21436.0,21436.0,21436.0,21436.0
mean,0.013109,0.026078,0.61504,0.003732,0.146483,0.003825
std,0.225639,0.251927,1.167344,0.068201,1.724912,0.06955
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,4.0,4.0,8.0,2.0,111.0,2.0


In [86]:
df_south.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21436 entries, 0 to 21435
Data columns (total 24 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    21436 non-null  object
 1   ind     21436 non-null  int64 
 2   maxtp   21436 non-null  object
 3   ind.1   21436 non-null  int64 
 4   mintp   21436 non-null  object
 5   igmin   21436 non-null  object
 6   gmin    21436 non-null  object
 7   ind.2   21436 non-null  int64 
 8   rain    21436 non-null  object
 9   cbl     21436 non-null  object
 10  wdsp    21436 non-null  object
 11  ind.3   21436 non-null  int64 
 12  hm      21436 non-null  object
 13  ind.4   21436 non-null  int64 
 14  ddhm    21436 non-null  object
 15  ind.5   21436 non-null  int64 
 16  hg      21436 non-null  object
 17  soil    21436 non-null  object
 18  pe      21436 non-null  object
 19  evap    21436 non-null  object
 20  smd_wd  21436 non-null  object
 21  smd_md  21436 non-null  object
 22  smd_pd  21436 non-null

In [87]:
# Count of missing values in each column
print(df_south.isnull().sum())  

date      0
ind       0
maxtp     0
ind.1     0
mintp     0
igmin     0
gmin      0
ind.2     0
rain      0
cbl       0
wdsp      0
ind.3     0
hm        0
ind.4     0
ddhm      0
ind.5     0
hg        0
soil      0
pe        0
evap      0
smd_wd    0
smd_md    0
smd_pd    0
glorad    0
dtype: int64


#### Drop Unnecessary Columns
[Very Helpful Website](https://www.statology.org/pandas-drop-columns-not-in-list/)

In [108]:
keep_cols = ["date" , "rain", "maxtp",  "mintp", "cbl", "wdsp", "hm", "ddhm", "hg"]
df_new_south = df_south[df_south.columns.intersection(keep_cols)]
df_new_south.head()

Unnamed: 0,date,maxtp,mintp,rain,cbl,wdsp,hm,ddhm,hg
0,01-dec-1955,10.9,5.9,9.1,1004.2,11.5,18,170,24
1,02-dec-1955,12.1,4.0,0.9,1011.2,12.1,23,210,32
2,03-dec-1955,10.7,6.4,0.0,1018.2,8.0,16,270,21
3,04-dec-1955,11.6,4.8,1.5,1026.6,6.3,16,210,23
4,05-dec-1955,11.8,10.7,0.8,1023.2,16.5,23,220,32


### Belmullet (West)

In [88]:
filepath2 = r"C:\Users\joann\Desktop\PFDA-project\dataframes\belmullet_mayo.csv"
df_west = pd.read_csv(filepath2, skiprows= 24)
df_west.head()

Unnamed: 0,date,ind,maxtp,ind.1,mintp,igmin,gmin,ind.2,rain,cbl,...,ind.5,hg,sun,glorad,soil,pe,evap,smd_wd,smd_md,smd_pd
0,17-sep-1956,0,16.6,0,8.6,0,7.2,2,0.0,1022.6,...,0,18,,,12.725,1.3,1.9,,,
1,18-sep-1956,0,17.5,0,11.6,0,11.0,2,0.0,1017.9,...,0,14,,,14.65,1.4,1.9,,,
2,19-sep-1956,0,17.0,0,12.7,0,11.8,0,0.0,1012.9,...,0,26,,,14.6,1.1,1.6,,,
3,20-sep-1956,0,19.2,0,13.4,0,12.6,0,1.0,1005.5,...,0,38,,,14.975,1.8,2.5,,,
4,21-sep-1956,0,17.4,0,12.3,0,11.7,0,0.6,998.7,...,0,36,,,14.65,1.6,2.3,,,


In [89]:
df_west.tail()

Unnamed: 0,date,ind,maxtp,ind.1,mintp,igmin,gmin,ind.2,rain,cbl,...,ind.5,hg,sun,glorad,soil,pe,evap,smd_wd,smd_md,smd_pd
24907,26-nov-2024,0,7.9,0,0.2,0,-0.3,0,1.3,1009.7,...,0,15,1.9,224,5.627,0.2,0.3,0.0,-1.1,-10.0
24908,27-nov-2024,0,8.7,0,-1.6,0,-4.8,0,0.0,1016.7,...,0,13,4.5,352,3.646,0.0,0.1,0.0,0.0,-9.5
24909,28-nov-2024,0,12.7,0,2.1,0,-3.0,0,7.4,1014.3,...,0,35,0.0,40,5.12,0.6,0.8,0.0,-6.8,-10.0
24910,29-nov-2024,0,12.7,0,9.8,0,5.5,0,8.8,1006.8,...,0,26,0.0,32,8.69,0.6,0.8,0.0,-8.2,-10.0
24911,30-nov-2024,0,13.8,0,11.3,0,9.9,0,2.2,1003.6,...,0,46,0.1,96,9.898,0.8,1.0,0.0,-1.4,-10.0


In [90]:
df_west.describe()

Unnamed: 0,ind,maxtp,ind.1,mintp,ind.2,rain,wdsp,ind.3,ind.4,ind.5,pe,evap
count,24912.0,24912.0,24912.0,24912.0,24912.0,24912.0,24912.0,24912.0,24912.0,24912.0,24912.0,24912.0
mean,4e-05,12.931214,0.031551,7.254813,0.525891,3.288929,12.695087,0.000682,0.001084,0.000763,1.42225,2.145697
std,0.006336,3.935487,0.175035,3.988386,1.056117,4.922051,5.75121,0.032915,0.040555,0.035268,0.886757,1.382394
min,0.0,-1.1,0.0,-8.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,10.1,0.0,4.4,0.0,0.0,8.4,0.0,0.0,0.0,0.7,1.0
50%,0.0,12.7,0.0,7.5,0.0,1.3,12.0,0.0,0.0,0.0,1.2,1.9
75%,0.0,15.9,0.0,10.4,0.0,4.6,16.3,0.0,0.0,0.0,2.0,3.1
max,1.0,29.9,2.0,19.3,4.0,67.8,44.7,2.0,2.0,2.0,5.9,7.9


In [91]:
df_west.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24912 entries, 0 to 24911
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   date    24912 non-null  object 
 1   ind     24912 non-null  int64  
 2   maxtp   24912 non-null  float64
 3   ind.1   24912 non-null  int64  
 4   mintp   24912 non-null  float64
 5   igmin   24912 non-null  object 
 6   gmin    24912 non-null  object 
 7   ind.2   24912 non-null  int64  
 8   rain    24912 non-null  float64
 9   cbl     24912 non-null  object 
 10  wdsp    24912 non-null  float64
 11  ind.3   24912 non-null  int64  
 12  hm      24912 non-null  object 
 13  ind.4   24912 non-null  int64  
 14  ddhm    24912 non-null  object 
 15  ind.5   24912 non-null  int64  
 16  hg      24912 non-null  object 
 17  sun     24912 non-null  object 
 18  glorad  24912 non-null  object 
 19  soil    24912 non-null  object 
 20  pe      24912 non-null  float64
 21  evap    24912 non-null  float64
 22

In [92]:
print(df_west.isnull().sum())  

date      0
ind       0
maxtp     0
ind.1     0
mintp     0
igmin     0
gmin      0
ind.2     0
rain      0
cbl       0
wdsp      0
ind.3     0
hm        0
ind.4     0
ddhm      0
ind.5     0
hg        0
sun       0
glorad    0
soil      0
pe        0
evap      0
smd_wd    0
smd_md    0
smd_pd    0
dtype: int64


#### Drop Unnecessary Columns

In [109]:
keep_cols = ["date" , "rain", "maxtp",  "mintp", "cbl", "wdsp", "hm", "ddhm", "hg"]
df_new_west = df_west[df_west.columns.intersection(keep_cols)]
df_new_west.head()

Unnamed: 0,date,maxtp,mintp,rain,cbl,wdsp,hm,ddhm,hg
0,17-sep-1956,16.6,8.6,0.0,1022.6,7.4,13,100,18
1,18-sep-1956,17.5,11.6,0.0,1017.9,8.0,13,170,14
2,19-sep-1956,17.0,12.7,0.0,1012.9,11.4,18,150,26
3,20-sep-1956,19.2,13.4,1.0,1005.5,17.5,27,210,38
4,21-sep-1956,17.4,12.3,0.6,998.7,17.4,27,170,36


### Malin (North)

In [93]:
filepath3= r"C:\Users\joann\Desktop\PFDA-project\dataframes\malin.csv"
df_north = pd.read_csv(filepath3, skiprows= 24)
df_north.head()

Unnamed: 0,date,ind,maxtp,ind.1,mintp,igmin,gmin,ind.2,rain,cbl,...,ind.5,hg,sun,glorad,soil,pe,evap,smd_wd,smd_md,smd_pd
0,01-may-1955,0,9.4,0,6.9,0,3.8,0,7.4,996.2,...,2,48,,,,1.0,1.9,,,
1,02-may-1955,0,11.2,0,5.7,0,5.9,3,0.0,997.1,...,0,31,,,12.3,1.9,3.6,,,
2,03-may-1955,0,13.1,0,4.3,0,0.6,0,6.6,989.6,...,0,48,,,9.2,1.4,2.4,,,
3,04-may-1955,0,12.8,0,7.8,0,6.1,0,4.4,978.3,...,0,41,,,9.625,2.0,3.6,,,
4,05-may-1955,0,13.2,0,8.1,0,6.6,0,2.7,990.3,...,0,34,,,10.775,2.0,3.4,,,


In [94]:
df_north.tail()

Unnamed: 0,date,ind,maxtp,ind.1,mintp,igmin,gmin,ind.2,rain,cbl,...,ind.5,hg,sun,glorad,soil,pe,evap,smd_wd,smd_md,smd_pd
25412,26-nov-2024,0,8.2,0,2.9,0,2.7,0,6.2,1006.7,...,0,29,1.2,199,6.132,0.4,0.4,0.0,-5.8,-10.0
25413,27-nov-2024,0,6.5,0,1.4,0,-1.0,0,1.0,1016.5,...,0,14,1.0,209,4.949,0.2,0.2,0.0,-0.8,-10.0
25414,28-nov-2024,0,12.1,0,1.9,0,-1.3,0,2.2,1015.9,...,0,45,0.0,85,4.588,0.8,1.1,0.0,-1.4,-10.0
25415,29-nov-2024,0,13.9,0,11.8,0,2.2,0,0.7,1008.2,...,0,38,0.0,30,9.148,1.0,1.4,0.3,0.3,-9.2
25416,30-nov-2024,0,15.4,0,11.5,0,9.1,0,0.1,1003.8,...,0,40,0.6,153,10.047,1.1,1.3,1.3,1.3,-7.7


In [95]:
df_north.describe()

Unnamed: 0,ind,maxtp,ind.1,mintp,ind.2,rain,cbl,ind.3,ind.4,ind.5
count,25417.0,25417.0,25417.0,25417.0,25417.0,25417.0,25417.0,25417.0,25417.0,25417.0
mean,0.000315,12.128717,0.020144,7.252481,0.464689,2.992769,1008.928394,0.065704,0.065389,0.074674
std,0.023468,3.993466,0.142166,3.850467,0.986224,4.642094,12.484083,0.352063,0.351227,1.045327
min,0.0,-0.8,0.0,-6.2,0.0,0.0,953.6,0.0,0.0,0.0
25%,0.0,9.1,0.0,4.3,0.0,0.0,1001.2,0.0,0.0,0.0
50%,0.0,12.0,0.0,7.4,0.0,1.0,1009.9,0.0,0.0,0.0
75%,0.0,15.1,0.0,10.4,0.0,4.2,1017.8,0.0,0.0,0.0
max,2.0,27.1,2.0,18.4,5.0,80.6,1046.4,2.0,2.0,111.0


In [96]:
df_north.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25417 entries, 0 to 25416
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   date    25417 non-null  object 
 1   ind     25417 non-null  int64  
 2   maxtp   25417 non-null  float64
 3   ind.1   25417 non-null  int64  
 4   mintp   25417 non-null  float64
 5   igmin   25417 non-null  object 
 6   gmin    25417 non-null  object 
 7   ind.2   25417 non-null  int64  
 8   rain    25417 non-null  float64
 9   cbl     25417 non-null  float64
 10  wdsp    25417 non-null  object 
 11  ind.3   25417 non-null  int64  
 12  hm      25417 non-null  object 
 13  ind.4   25417 non-null  int64  
 14  ddhm    25417 non-null  object 
 15  ind.5   25417 non-null  int64  
 16  hg      25417 non-null  object 
 17  sun     25417 non-null  object 
 18  glorad  25417 non-null  object 
 19  soil    25417 non-null  object 
 20  pe      25417 non-null  object 
 21  evap    25417 non-null  object 
 22

In [97]:
print(df_north.isnull().sum()) 

date      0
ind       0
maxtp     0
ind.1     0
mintp     0
igmin     0
gmin      0
ind.2     0
rain      0
cbl       0
wdsp      0
ind.3     0
hm        0
ind.4     0
ddhm      0
ind.5     0
hg        0
sun       0
glorad    0
soil      0
pe        0
evap      0
smd_wd    0
smd_md    0
smd_pd    0
dtype: int64


#### Drop Unneccesary Columns

In [110]:
keep_cols = ["date" , "rain", "maxtp",  "mintp", "cbl", "wdsp", "hm", "ddhm", "hg"]
df_new_north = df_north[df_north.columns.intersection(keep_cols)]
df_new_north.head()

Unnamed: 0,date,maxtp,mintp,rain,cbl,wdsp,hm,ddhm,hg
0,01-may-1955,9.4,6.9,7.4,996.2,19.2,34,70,48
1,02-may-1955,11.2,5.7,0.0,997.1,14.2,24,310,31
2,03-may-1955,13.1,4.3,6.6,989.6,14.8,30,130,48
3,04-may-1955,12.8,7.8,4.4,978.3,19.9,27,230,41
4,05-may-1955,13.2,8.1,2.7,990.3,16.0,24,240,34


### Dublin (East)

In [98]:
filepath4= r"C:\Users\joann\Desktop\PFDA-project\dataframes\dublin.csv"
df_east = pd.read_csv(filepath4, skiprows= 24)
df_east.head()

Unnamed: 0,date,ind,maxtp,ind.1,mintp,igmin,gmin,ind.2,rain,cbl,...,hg,sun,dos,g_rad,soil,pe,evap,smd_wd,smd_md,smd_pd
0,01-jan-1942,0,9.7,0,6.8,0,4.7,2,0.0,1020.3,...,,0.0,0,,,1.1,1.4,,,
1,02-jan-1942,0,9.9,0,7.9,0,6.7,0,0.1,1016.2,...,,0.0,0,,,0.7,0.9,,,
2,03-jan-1942,0,11.2,0,8.9,0,7.2,0,1.5,1006.8,...,,0.1,0,,,0.5,0.6,,,
3,04-jan-1942,0,9.2,0,2.7,0,3.4,0,3.5,1001.5,...,,0.6,0,,,0.6,0.7,,,
4,05-jan-1942,0,3.5,1,-0.8,0,0.0,0,0.6,1013.4,...,,3.4,0,,,0.6,0.7,,,


In [99]:
df_east.tail()

Unnamed: 0,date,ind,maxtp,ind.1,mintp,igmin,gmin,ind.2,rain,cbl,...,hg,sun,dos,g_rad,soil,pe,evap,smd_wd,smd_md,smd_pd
30280,26-nov-2024,0,7.8,1,-1.9,1,-0.4,3,0.0,1000.8,...,16,7.3,0,,3.625,0.2,0.3,0.8,0.8,-7.8
30281,27-nov-2024,0,6.2,1,-3.3,1,-6.8,3,0.0,1008.4,...,13,6.7,0,,2.375,0.1,0.1,0.9,0.9,-7.4
30282,28-nov-2024,0,11.3,1,-4.1,1,-7.9,0,0.5,1011.0,...,33,0.0,0,,3.675,0.4,0.6,0.8,0.8,-7.0
30283,29-nov-2024,0,14.2,0,10.9,0,5.4,0,0.2,1005.1,...,28,0.0,0,,8.725,1.0,1.2,1.6,1.6,-5.9
30284,30-nov-2024,0,15.8,0,11.8,0,9.2,0,0.4,1001.8,...,31,0.8,0,,10.0,0.8,0.9,2.0,2.0,-5.2


In [100]:
df_east.describe()

Unnamed: 0,ind,maxtp,ind.1,mintp,igmin,ind.2,rain,cbl,wdsp,ind.3,ind.4,ind.5,sun,pe
count,30285.0,30285.0,30285.0,30285.0,30285.0,30285.0,30285.0,30285.0,30285.0,30285.0,30285.0,30285.0,30285.0,30285.0
mean,0.00109,13.072574,0.083408,6.16143,0.217897,0.88529,2.072617,1003.481199,10.187294,0.030477,0.030741,0.030279,4.014525,1.506072
std,0.033978,4.90601,0.276621,4.380311,0.416407,1.232237,4.390688,11.736406,4.595588,0.175323,0.175653,0.174223,3.759031,1.001873
min,0.0,-4.7,0.0,-12.2,0.0,0.0,0.0,949.6,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,9.5,0.0,2.9,0.0,0.0,0.0,996.2,6.8,0.0,0.0,0.0,0.5,0.7
50%,0.0,13.0,0.0,6.3,0.0,0.0,0.2,1004.5,9.5,0.0,0.0,0.0,3.2,1.3
75%,0.0,16.9,0.0,9.6,0.0,2.0,2.2,1011.7,13.0,0.0,0.0,0.0,6.5,2.2
max,2.0,29.1,2.0,18.4,4.0,4.0,92.6,1037.4,35.5,2.0,2.0,2.0,15.9,5.7


In [101]:
df_east.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30285 entries, 0 to 30284
Data columns (total 26 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   date    30285 non-null  object 
 1   ind     30285 non-null  int64  
 2   maxtp   30285 non-null  float64
 3   ind.1   30285 non-null  int64  
 4   mintp   30285 non-null  float64
 5   igmin   30285 non-null  int64  
 6   gmin    30285 non-null  object 
 7   ind.2   30285 non-null  int64  
 8   rain    30285 non-null  float64
 9   cbl     30285 non-null  float64
 10  wdsp    30285 non-null  float64
 11  ind.3   30285 non-null  int64  
 12  hm      30285 non-null  object 
 13  ind.4   30285 non-null  int64  
 14  ddhm    30285 non-null  object 
 15  ind.5   30285 non-null  int64  
 16  hg      30285 non-null  object 
 17  sun     30285 non-null  float64
 18  dos     30285 non-null  object 
 19  g_rad   30285 non-null  object 
 20  soil    30285 non-null  object 
 21  pe      30285 non-null  float64
 22

In [102]:
print(df_east.isnull().sum()) 

date      0
ind       0
maxtp     0
ind.1     0
mintp     0
igmin     0
gmin      0
ind.2     0
rain      0
cbl       0
wdsp      0
ind.3     0
hm        0
ind.4     0
ddhm      0
ind.5     0
hg        0
sun       0
dos       0
g_rad     0
soil      0
pe        0
evap      0
smd_wd    0
smd_md    0
smd_pd    0
dtype: int64


#### Drop Unnecessary Columns

In [111]:
keep_cols = ["date" , "rain", "maxtp",  "mintp", "cbl", "wdsp", "hm", "ddhm", "hg"]
df_new_east = df_east[df_east.columns.intersection(keep_cols)]
df_new_east.head()

Unnamed: 0,date,maxtp,mintp,rain,cbl,wdsp,hm,ddhm,hg
0,01-jan-1942,9.7,6.8,0.0,1020.3,17.2,,,
1,02-jan-1942,9.9,7.9,0.1,1016.2,15.2,,,
2,03-jan-1942,11.2,8.9,1.5,1006.8,14.0,,,
3,04-jan-1942,9.2,2.7,3.5,1001.5,17.0,,,
4,05-jan-1942,3.5,-0.8,0.6,1013.4,13.0,,,


### Mullingar (The Midlands)

In [103]:
filepath5= r"C:\Users\joann\Desktop\PFDA-project\dataframes\mullingar.csv"
df_midlands = pd.read_csv(filepath5, skiprows= 24)
df_midlands.head()

Unnamed: 0,date,ind,maxtp,ind.1,mintp,igmin,gmin,ind.2,rain,cbl,...,ddhm,ind.5,hg,soil,pe,evap,smd_wd,smd_md,smd_pd,glorad
0,08-nov-1973,0,13.6,0,10.6,0,8.2,0,0.6,1010.7,...,240,0,37,9.725,0.6,0.9,,,,
1,09-nov-1973,0,12.3,0,7.1,0,10.5,0,13.9,1004.2,...,230,0,36,10.175,0.4,0.7,,,,
2,10-nov-1973,0,9.8,0,2.7,0,2.8,0,1.3,1003.9,...,300,0,36,8.6,0.4,0.5,,,,
3,11-nov-1973,0,8.2,0,1.9,1,-0.2,0,0.8,1011.2,...,280,0,29,6.05,0.4,0.6,,,,
4,12-nov-1973,0,12.6,0,6.5,1,-1.5,0,5.2,1004.2,...,260,0,45,8.05,0.7,1.0,,,,


In [104]:
df_midlands.tail()

Unnamed: 0,date,ind,maxtp,ind.1,mintp,igmin,gmin,ind.2,rain,cbl,...,ddhm,ind.5,hg,soil,pe,evap,smd_wd,smd_md,smd_pd,glorad
18646,26-nov-2024,0,8.3,0,-3.0,0,-7.7,0,0.0,998.7,...,220,0,9,3.834,0.0,0.1,0.3,0.3,-8.7,427
18647,27-nov-2024,0,3.4,0,-4.1,0,-8.4,0,0.0,1006.3,...,10,0,8,3.086,0.0,0.0,0.3,0.3,-8.3,397
18648,28-nov-2024,0,11.6,0,-2.7,0,-8.3,0,1.5,1007.7,...,170,0,25,4.336,0.3,0.5,0.0,-0.9,-9.1,96
18649,29-nov-2024,0,13.2,0,10.9,0,3.3,0,2.3,1001.3,...,160,0,25,8.954,0.6,0.8,0.0,-1.7,-10.0,42
18650,30-nov-2024,0,14.3,0,10.0,0,6.2,0,4.5,998.2,...,180,0,30,9.535,0.5,0.6,0.0,-4.0,-10.0,144


In [105]:
df_midlands.describe()

Unnamed: 0,ind,maxtp,ind.1,mintp,ind.2,rain,ind.3,ind.4,ind.5
count,18651.0,18651.0,18651.0,18651.0,18651.0,18651.0,18651.0,18651.0,18651.0
mean,0.007238,13.040909,0.0897,5.429055,0.587904,2.670575,0.003592,0.003271,0.003646
std,0.813559,5.092989,0.858233,4.601677,1.132002,4.599139,0.077757,0.073518,0.078782
min,0.0,-4.9,0.0,-14.9,0.0,0.0,0.0,0.0,0.0
25%,0.0,9.3,0.0,2.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,12.9,0.0,5.5,0.0,0.6,0.0,0.0,0.0
75%,0.0,16.9,0.0,9.0,0.0,3.5,0.0,0.0,0.0
max,111.0,30.4,111.0,17.8,4.0,71.1,2.0,2.0,2.0


In [106]:
df_midlands.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18651 entries, 0 to 18650
Data columns (total 24 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   date    18651 non-null  object 
 1   ind     18651 non-null  int64  
 2   maxtp   18651 non-null  float64
 3   ind.1   18651 non-null  int64  
 4   mintp   18651 non-null  float64
 5   igmin   18651 non-null  object 
 6   gmin    18651 non-null  object 
 7   ind.2   18651 non-null  int64  
 8   rain    18651 non-null  float64
 9   cbl     18651 non-null  object 
 10  wdsp    18651 non-null  object 
 11  ind.3   18651 non-null  int64  
 12  hm      18651 non-null  object 
 13  ind.4   18651 non-null  int64  
 14  ddhm    18651 non-null  object 
 15  ind.5   18651 non-null  int64  
 16  hg      18651 non-null  object 
 17  soil    18651 non-null  object 
 18  pe      18651 non-null  object 
 19  evap    18651 non-null  object 
 20  smd_wd  18651 non-null  object 
 21  smd_md  18651 non-null  object 
 22

In [107]:
print(df_midlands.isnull().sum()) 

date      0
ind       0
maxtp     0
ind.1     0
mintp     0
igmin     0
gmin      0
ind.2     0
rain      0
cbl       0
wdsp      0
ind.3     0
hm        0
ind.4     0
ddhm      0
ind.5     0
hg        0
soil      0
pe        0
evap      0
smd_wd    0
smd_md    0
smd_pd    0
glorad    0
dtype: int64


#### Drop Unnecessary Columns

In [112]:
keep_cols = ["date" , "rain", "maxtp",  "mintp", "cbl", "wdsp", "hm", "ddhm", "hg"]
df_new_midlands = df_midlands[df_midlands.columns.intersection(keep_cols)]
df_new_midlands.head()

Unnamed: 0,date,maxtp,mintp,rain,cbl,wdsp,hm,ddhm,hg
0,08-nov-1973,13.6,10.6,0.6,1010.7,17.6,23,240,37
1,09-nov-1973,12.3,7.1,13.9,1004.2,14.8,21,230,36
2,10-nov-1973,9.8,2.7,1.3,1003.9,10.4,19,300,36
3,11-nov-1973,8.2,1.9,0.8,1011.2,9.3,17,280,29
4,12-nov-1973,12.6,6.5,5.2,1004.2,20.0,25,260,45


## References
https://www.statology.org/pandas-drop-columns-not-in-list/