# Designed to Clean / Standardize any Dataset into a standard format

In [63]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import calendar
import string
from string import punctuation
from itertools import chain

from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, f1_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_selection import SelectPercentile, chi2, f_regression, f_classif

from sklearn import svm
from sklearn.utils import shuffle

df = pd.read_csv('cleaned_weather_data.csv', encoding='latin1')  # load the weather CSV into a DataFrame (file is read as latin1)
pd.options.mode.copy_on_write = True  # opt in to pandas copy-on-write behaviour for the cells below


# Read and Understand the Dataset

In [65]:
df.shape  # (number of rows, number of columns)

(7637, 28)

In [66]:
df.info()  # column dtypes and non-null counts of the raw data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7637 entries, 0 to 7636
Data columns (total 28 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              7637 non-null   object 
 1   datetime          7637 non-null   object 
 2   tempmax           7637 non-null   float64
 3   tempmin           7637 non-null   float64
 4   temp              7637 non-null   float64
 5   feelslikemax      7637 non-null   float64
 6   feelslikemin      7637 non-null   float64
 7   feelslike         7637 non-null   float64
 8   dew               7637 non-null   float64
 9   humidity          7637 non-null   float64
 10  precip            5841 non-null   float64
 11  precipprob        7637 non-null   int64  
 12  precipcover       7637 non-null   float64
 13  windspeed         7637 non-null   float64
 14  winddir           7607 non-null   float64
 15  sealevelpressure  7355 non-null   float64
 16  cloudcover        7637 non-null   float64


In [67]:
df.head(10)   # preview the first 10 rows to see what the raw data looks like


Unnamed: 0,name,datetime,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,humidity,...,solarradiation,solarenergy,uvindex,sunrise,sunset,moonphase,conditions,description,icon,stations
0,"Port of Spain, Trinidad, Trinidad And Tobago",2000-01-01,87.9,71.7,80.0,94.4,71.7,82.7,72.0,78.0,...,,,,2000-01-01T06:24:06,2000-01-01T17:54:48,0.83,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,78970099999
1,"Port of Spain, Trinidad, Trinidad And Tobago",2000-01-02,87.9,71.3,78.2,96.1,71.3,80.5,72.5,83.7,...,,,,2000-01-02T06:24:30,2000-01-02T17:55:20,0.86,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,78970099999
2,"Port of Spain, Trinidad, Trinidad And Tobago",2000-01-03,89.7,73.5,79.3,95.0,73.5,81.2,71.6,78.6,...,,,,2000-01-03T06:24:53,2000-01-03T17:55:53,0.9,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,78970099999
3,"Port of Spain, Trinidad, Trinidad And Tobago",2000-01-04,84.3,71.3,76.1,89.8,71.3,77.2,70.9,84.9,...,,,,2000-01-04T06:25:16,2000-01-04T17:56:26,0.93,Partially cloudy,Clearing in the afternoon.,partly-cloudy-day,78970099999
4,"Port of Spain, Trinidad, Trinidad And Tobago",2000-01-05,81.2,73.1,76.1,86.7,73.1,76.5,73.9,93.0,...,,,,2000-01-05T06:25:37,2000-01-05T17:56:59,0.96,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,78970099999
5,"Port of Spain, Trinidad, Trinidad And Tobago",2000-01-06,87.9,73.5,77.9,96.1,73.5,79.7,73.4,86.7,...,,,,2000-01-06T06:25:58,2000-01-06T17:57:31,0.0,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,78970099999
6,"Port of Spain, Trinidad, Trinidad And Tobago",2000-01-07,87.5,73.5,77.3,92.9,73.5,78.4,71.5,83.4,...,,,,2000-01-07T06:26:18,2000-01-07T17:58:04,0.03,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,78970099999
7,"Port of Spain, Trinidad, Trinidad And Tobago",2000-01-08,87.9,72.8,79.7,92.9,72.8,81.6,70.6,75.8,...,,,,2000-01-08T06:26:38,2000-01-08T17:58:36,0.07,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,78970099999
8,"Port of Spain, Trinidad, Trinidad And Tobago",2000-01-09,87.9,71.7,78.5,95.4,71.7,81.3,72.3,82.3,...,,,,2000-01-09T06:26:56,2000-01-09T17:59:09,0.1,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,78970099999
9,"Port of Spain, Trinidad, Trinidad And Tobago",2000-01-10,88.4,73.5,79.6,95.7,73.5,82.0,71.9,78.8,...,,,,2000-01-10T06:27:13,2000-01-10T17:59:41,0.14,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,78970099999


In [68]:
df.describe()  # summary statistics (count, mean, std, min, quartiles, max) per numeric column

Unnamed: 0,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,humidity,precip,precipprob,precipcover,windspeed,winddir,sealevelpressure,cloudcover,visibility,solarradiation,solarenergy,uvindex,moonphase
count,7637.0,7637.0,7637.0,7637.0,7637.0,7637.0,7637.0,7637.0,5841.0,7637.0,7637.0,7637.0,7607.0,7355.0,7637.0,7635.0,4018.0,4018.0,4018.0,7637.0
mean,89.018712,74.740016,80.68612,97.527,74.74524,84.049849,73.453765,80.131793,0.18622,43.799921,5.225413,15.349758,92.630695,1012.59482,60.136651,7.109679,225.828571,19.502837,7.856396,0.482805
std,2.99024,2.307903,1.964805,4.983782,2.326527,3.392538,2.326326,6.116863,0.439652,49.61735,14.130285,4.611617,32.288826,1.54614,18.650294,1.017579,55.495058,4.789561,1.688909,0.288445
min,71.7,37.3,71.0,71.7,37.3,71.0,63.2,57.2,0.0,0.0,0.0,0.0,0.1,1006.0,17.5,2.7,0.0,0.0,0.0,0.0
25%,87.8,73.4,79.3,94.2,73.4,81.5,71.9,75.7,0.0,0.0,0.0,12.8,73.9,1011.6,43.5,6.5,194.025,16.8,7.0,0.25
50%,89.6,75.2,80.8,97.8,75.2,84.1,74.0,79.8,0.016,0.0,0.0,15.0,91.4,1012.7,58.3,7.0,236.4,20.4,8.0,0.48
75%,91.4,76.7,82.2,100.6,76.7,86.6,75.3,84.5,0.161,100.0,4.17,17.2,104.9,1013.7,77.1,7.6,269.0,23.2,9.0,0.75
max,138.1,80.9,86.2,138.1,87.0,94.6,78.2,99.3,6.142,100.0,100.0,118.6,360.0,1017.6,100.0,16.8,328.7,28.2,10.0,0.98


In [69]:
df.isna()   # boolean frame of the same shape as df: True where a value is missing, False otherwise

Unnamed: 0,name,datetime,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,humidity,...,solarradiation,solarenergy,uvindex,sunrise,sunset,moonphase,conditions,description,icon,stations
0,False,False,False,False,False,False,False,False,False,False,...,True,True,True,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,True,True,True,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,True,True,True,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,True,True,True,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,True,True,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7632,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7633,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7634,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7635,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [70]:
df.isna().sum()  # number of missing values in each column

name                   0
datetime               0
tempmax                0
tempmin                0
temp                   0
feelslikemax           0
feelslikemin           0
feelslike              0
dew                    0
humidity               0
precip              1796
precipprob             0
precipcover            0
windspeed              0
winddir               30
sealevelpressure     282
cloudcover             0
visibility             2
solarradiation      3619
solarenergy         3619
uvindex             3619
sunrise                0
sunset                 0
moonphase              0
conditions             0
description            0
icon                   0
stations               0
dtype: int64

In [71]:
# Print every column name next to its number of distinct (non-null) values.
for column_name, distinct_count in df.nunique().items():
    print(column_name, distinct_count)

name 1
datetime 7637
tempmax 135
tempmin 96
temp 123
feelslikemax 272
feelslikemin 99
feelslike 198
dew 129
humidity 333
precip 590
precipprob 2
precipcover 25
windspeed 120
winddir 1102
sealevelpressure 101
cloudcover 698
visibility 96
solarradiation 1754
solarenergy 227
uvindex 11
sunrise 7637
sunset 7637
moonphase 96
conditions 5
description 33
icon 4
stations 13


# Basic Data Preprocessing

Sunrise / sunset data converted to contain only the time, in 24-hour format.
Icon / station / wind direction / sea-level pressure data was removed as it was deemed unnecessary.
Temperatures converted from F to C.
One-hot encoding for Condition.
Season category added, converting it to Wet and Dry.
Precip converted from inches to mm.
Name truncated to only the location (for example, "Port of Spain").


In [73]:
# Fraction of rows that are missing, per column (mean of the boolean null mask).
df.isna().mean()

name                0.000000
datetime            0.000000
tempmax             0.000000
tempmin             0.000000
temp                0.000000
feelslikemax        0.000000
feelslikemin        0.000000
feelslike           0.000000
dew                 0.000000
humidity            0.000000
precip              0.235171
precipprob          0.000000
precipcover         0.000000
windspeed           0.000000
winddir             0.003928
sealevelpressure    0.036925
cloudcover          0.000000
visibility          0.000262
solarradiation      0.473877
solarenergy         0.473877
uvindex             0.473877
sunrise             0.000000
sunset              0.000000
moonphase           0.000000
conditions          0.000000
description         0.000000
icon                0.000000
stations            0.000000
dtype: float64

In [74]:
# Share of all rows taken by each distinct precip value; the denominator is the
# total row count, so rows with missing precip are included in it.
df["precip"].value_counts() / df.shape[0]

precip
0.000    0.326830
0.039    0.026843
0.079    0.023177
0.012    0.023046
0.020    0.020427
           ...   
0.751    0.000131
2.728    0.000131
1.157    0.000131
1.410    0.000131
0.705    0.000131
Name: count, Length: 590, dtype: float64

In [75]:
# Parse the date column and derive temporary calendar keys used for the
# (month, day) rainfall lookup below.
df["datetime"] = pd.to_datetime(df["datetime"])
df["day"] = df["datetime"].dt.day
df["month"] = df["datetime"].dt.month
df["year"] = df["datetime"].dt.year

# Reference period for the "typical rainfall on this calendar day" estimate.
# The frame has a default integer index, so label-slicing it with date strings
# through df.loc[...] does not select rows by date; filter on the datetime
# column instead so the full 2005-2020 window is actually used.
ref_mask = df["datetime"].between("2005-01-01", "2020-12-31")
ref_df = df[ref_mask]

# Mean observed precip for every (month, day) pair inside the reference period.
day_avg_precip = (
    ref_df.dropna(subset=["precip"])
    .groupby(["month", "day"])["precip"]
    .mean()
)

# Only rows up to 2004 with no precip reading are filled in.
missing_mask = (df["year"] <= 2004) & df["precip"].isna()
if missing_mask.any():
    lookup_keys = zip(df.loc[missing_mask, "month"], df.loc[missing_mask, "day"])
    # A calendar day with no reference data stays missing (NaN).
    df.loc[missing_mask, "precip"] = [
        day_avg_precip.get(key, float("nan")) for key in lookup_keys
    ]

# The helper calendar columns are not part of the cleaned dataset.
df.drop(columns=["month", "day", "year"], inplace=True)


In [76]:
def drop_name(name):
    """Return the part of ``name`` before its first comma.

    When ``name`` contains no comma the whole string is returned unchanged.
    """
    head, _separator, _remainder = name.partition(',')
    return head

# Replace the full place name with just its leading location part.
# Series.map applies the scalar helper element by element, avoiding the
# construction of a throwaway pd.Series for every row.
df['location'] = df['name'].map(drop_name)
df.drop(columns=['name'], inplace=True)
df.head(10)

Unnamed: 0,datetime,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,humidity,precip,...,solarenergy,uvindex,sunrise,sunset,moonphase,conditions,description,icon,stations,location
0,2000-01-01,87.9,71.7,80.0,94.4,71.7,82.7,72.0,78.0,0.385,...,,,2000-01-01T06:24:06,2000-01-01T17:54:48,0.83,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,78970099999,Port of Spain
1,2000-01-02,87.9,71.3,78.2,96.1,71.3,80.5,72.5,83.7,0.08275,...,,,2000-01-02T06:24:30,2000-01-02T17:55:20,0.86,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,78970099999,Port of Spain
2,2000-01-03,89.7,73.5,79.3,95.0,73.5,81.2,71.6,78.6,0.193,...,,,2000-01-03T06:24:53,2000-01-03T17:55:53,0.9,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,78970099999,Port of Spain
3,2000-01-04,84.3,71.3,76.1,89.8,71.3,77.2,70.9,84.9,0.40375,...,,,2000-01-04T06:25:16,2000-01-04T17:56:26,0.93,Partially cloudy,Clearing in the afternoon.,partly-cloudy-day,78970099999,Port of Spain
4,2000-01-05,81.2,73.1,76.1,86.7,73.1,76.5,73.9,93.0,0.0275,...,,,2000-01-05T06:25:37,2000-01-05T17:56:59,0.96,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,78970099999,Port of Spain
5,2000-01-06,87.9,73.5,77.9,96.1,73.5,79.7,73.4,86.7,0.01975,...,,,2000-01-06T06:25:58,2000-01-06T17:57:31,0.0,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,78970099999,Port of Spain
6,2000-01-07,87.5,73.5,77.3,92.9,73.5,78.4,71.5,83.4,0.11025,...,,,2000-01-07T06:26:18,2000-01-07T17:58:04,0.03,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,78970099999,Port of Spain
7,2000-01-08,87.9,72.8,79.7,92.9,72.8,81.6,70.6,75.8,0.245,...,,,2000-01-08T06:26:38,2000-01-08T17:58:36,0.07,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,78970099999,Port of Spain
8,2000-01-09,87.9,71.7,78.5,95.4,71.7,81.3,72.3,82.3,0.0295,...,,,2000-01-09T06:26:56,2000-01-09T17:59:09,0.1,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,78970099999,Port of Spain
9,2000-01-10,88.4,73.5,79.6,95.7,73.5,82.0,71.9,78.8,0.123,...,,,2000-01-10T06:27:13,2000-01-10T17:59:41,0.14,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,78970099999,Port of Spain


In [77]:
def convert_sunset(date):
    """Return the second 'T'-separated field of ``date``.

    For a timestamp such as ``2000-01-01T06:24:06`` that field is the clock
    time ``06:24:06``. An IndexError is raised when ``date`` contains no 'T'.
    """
    return date.split('T')[1]

# Keep only the clock-time part of the sunrise / sunset timestamps.
# Series.map applies the scalar helper element by element, avoiding the
# construction of a throwaway pd.Series for every row.
df['sunrise_time'] = df['sunrise'].map(convert_sunset)
df['sunset_time'] = df['sunset'].map(convert_sunset)
df.drop(columns=['sunrise', 'sunset'], inplace=True)

df.head(10)

Unnamed: 0,datetime,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,humidity,precip,...,solarenergy,uvindex,moonphase,conditions,description,icon,stations,location,sunrise_time,sunset_time
0,2000-01-01,87.9,71.7,80.0,94.4,71.7,82.7,72.0,78.0,0.385,...,,,0.83,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,78970099999,Port of Spain,06:24:06,17:54:48
1,2000-01-02,87.9,71.3,78.2,96.1,71.3,80.5,72.5,83.7,0.08275,...,,,0.86,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,78970099999,Port of Spain,06:24:30,17:55:20
2,2000-01-03,89.7,73.5,79.3,95.0,73.5,81.2,71.6,78.6,0.193,...,,,0.9,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,78970099999,Port of Spain,06:24:53,17:55:53
3,2000-01-04,84.3,71.3,76.1,89.8,71.3,77.2,70.9,84.9,0.40375,...,,,0.93,Partially cloudy,Clearing in the afternoon.,partly-cloudy-day,78970099999,Port of Spain,06:25:16,17:56:26
4,2000-01-05,81.2,73.1,76.1,86.7,73.1,76.5,73.9,93.0,0.0275,...,,,0.96,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,78970099999,Port of Spain,06:25:37,17:56:59
5,2000-01-06,87.9,73.5,77.9,96.1,73.5,79.7,73.4,86.7,0.01975,...,,,0.0,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,78970099999,Port of Spain,06:25:58,17:57:31
6,2000-01-07,87.5,73.5,77.3,92.9,73.5,78.4,71.5,83.4,0.11025,...,,,0.03,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,78970099999,Port of Spain,06:26:18,17:58:04
7,2000-01-08,87.9,72.8,79.7,92.9,72.8,81.6,70.6,75.8,0.245,...,,,0.07,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,78970099999,Port of Spain,06:26:38,17:58:36
8,2000-01-09,87.9,71.7,78.5,95.4,71.7,81.3,72.3,82.3,0.0295,...,,,0.1,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,78970099999,Port of Spain,06:26:56,17:59:09
9,2000-01-10,88.4,73.5,79.6,95.7,73.5,82.0,71.9,78.8,0.123,...,,,0.14,Partially cloudy,Partly cloudy throughout the day.,partly-cloudy-day,78970099999,Port of Spain,06:27:13,17:59:41


In [78]:
# Columns that are not carried over into the cleaned dataset.
unused_columns = [
    'icon',
    'stations',
    'winddir',
    'sealevelpressure',
    'description',
    'conditions',
    'solarradiation',
    'solarenergy',
    'uvindex',
    'moonphase',
    'precipprob',
]
df.drop(columns=unused_columns, inplace=True)
df.head(10)

Unnamed: 0,datetime,tempmax,tempmin,temp,feelslikemax,feelslikemin,feelslike,dew,humidity,precip,precipcover,windspeed,cloudcover,visibility,location,sunrise_time,sunset_time
0,2000-01-01,87.9,71.7,80.0,94.4,71.7,82.7,72.0,78.0,0.385,0.0,15.0,43.8,7.6,Port of Spain,06:24:06,17:54:48
1,2000-01-02,87.9,71.3,78.2,96.1,71.3,80.5,72.5,83.7,0.08275,0.0,16.1,57.9,8.2,Port of Spain,06:24:30,17:55:20
2,2000-01-03,89.7,73.5,79.3,95.0,73.5,81.2,71.6,78.6,0.193,0.0,12.8,73.2,9.5,Port of Spain,06:24:53,17:55:53
3,2000-01-04,84.3,71.3,76.1,89.8,71.3,77.2,70.9,84.9,0.40375,0.0,13.9,77.7,9.7,Port of Spain,06:25:16,17:56:26
4,2000-01-05,81.2,73.1,76.1,86.7,73.1,76.5,73.9,93.0,0.0275,0.0,11.4,76.0,8.2,Port of Spain,06:25:37,17:56:59
5,2000-01-06,87.9,73.5,77.9,96.1,73.5,79.7,73.4,86.7,0.01975,0.0,18.3,80.0,6.8,Port of Spain,06:25:58,17:57:31
6,2000-01-07,87.5,73.5,77.3,92.9,73.5,78.4,71.5,83.4,0.11025,0.0,15.0,64.2,7.4,Port of Spain,06:26:18,17:58:04
7,2000-01-08,87.9,72.8,79.7,92.9,72.8,81.6,70.6,75.8,0.245,0.0,13.9,42.2,8.4,Port of Spain,06:26:38,17:58:36
8,2000-01-09,87.9,71.7,78.5,95.4,71.7,81.3,72.3,82.3,0.0295,0.0,16.1,51.4,7.1,Port of Spain,06:26:56,17:59:09
9,2000-01-10,88.4,73.5,79.6,95.7,73.5,82.0,71.9,78.8,0.123,0.0,18.3,42.4,8.5,Port of Spain,06:27:13,17:59:41


In [79]:
def inches_to_mm(inches):
    """Convert a length in inches to millimetres (1 inch = 25.4 mm)."""
    millimetres = 25.4 * inches
    return millimetres

# The output column name carries the location taken from the first row.
location = df['location'].iloc[0]
precip_row_name = f'{location} precip mm'
# Series.map applies the scalar conversion element by element, avoiding the
# construction of a throwaway pd.Series for every row.
df[precip_row_name] = df['precip'].map(inches_to_mm)
df.drop(columns=['precip'], inplace=True)


In [80]:
def fahrenheit_to_celsius(fahrenheit):
    """Convert a Fahrenheit temperature to Celsius, rounded to one decimal place."""
    degrees_above_freezing = fahrenheit - 32
    return round(degrees_above_freezing * 5 / 9, 1)


# Source Fahrenheit column -> name of the Celsius column that replaces it.
# Insertion order fixes the order in which the new columns are appended.
celsius_columns = {
    'tempmax': 'tempmax c',
    'tempmin': 'tempmin c',
    'temp': 'avgtemp c',
    'feelslikemax': 'feelslikemax c',
    'feelslikemin': 'feelslikemin c',
    'feelslike': 'avgfeelsliketemp c',
    'dew': 'dewpoint c',
}

# Series.map applies the scalar conversion element by element, so the rounding
# matches the helper exactly while avoiding a throwaway pd.Series for every row.
for fahrenheit_col, celsius_col in celsius_columns.items():
    df[celsius_col] = df[fahrenheit_col].map(fahrenheit_to_celsius)

# The Fahrenheit originals are no longer needed once converted.
df.drop(columns=list(celsius_columns), inplace=True)

df.head(10)

Unnamed: 0,datetime,humidity,precipcover,windspeed,cloudcover,visibility,location,sunrise_time,sunset_time,Port of Spain precip mm,tempmax c,tempmin c,avgtemp c,feelslikemax c,feelslikemin c,avgfeelsliketemp c,dewpoint c
0,2000-01-01,78.0,0.0,15.0,43.8,7.6,Port of Spain,06:24:06,17:54:48,9.779,31.1,22.1,26.7,34.7,22.1,28.2,22.2
1,2000-01-02,83.7,0.0,16.1,57.9,8.2,Port of Spain,06:24:30,17:55:20,2.10185,31.1,21.8,25.7,35.6,21.8,26.9,22.5
2,2000-01-03,78.6,0.0,12.8,73.2,9.5,Port of Spain,06:24:53,17:55:53,4.9022,32.1,23.1,26.3,35.0,23.1,27.3,22.0
3,2000-01-04,84.9,0.0,13.9,77.7,9.7,Port of Spain,06:25:16,17:56:26,10.25525,29.1,21.8,24.5,32.1,21.8,25.1,21.6
4,2000-01-05,93.0,0.0,11.4,76.0,8.2,Port of Spain,06:25:37,17:56:59,0.6985,27.3,22.8,24.5,30.4,22.8,24.7,23.3
5,2000-01-06,86.7,0.0,18.3,80.0,6.8,Port of Spain,06:25:58,17:57:31,0.50165,31.1,23.1,25.5,35.6,23.1,26.5,23.0
6,2000-01-07,83.4,0.0,15.0,64.2,7.4,Port of Spain,06:26:18,17:58:04,2.80035,30.8,23.1,25.2,33.8,23.1,25.8,21.9
7,2000-01-08,75.8,0.0,13.9,42.2,8.4,Port of Spain,06:26:38,17:58:36,6.223,31.1,22.7,26.5,33.8,22.7,27.6,21.4
8,2000-01-09,82.3,0.0,16.1,51.4,7.1,Port of Spain,06:26:56,17:59:09,0.7493,31.1,22.1,25.8,35.2,22.1,27.4,22.4
9,2000-01-10,78.8,0.0,18.3,42.4,8.5,Port of Spain,06:27:13,17:59:41,3.1242,31.3,23.1,26.4,35.4,23.1,27.8,22.2


In [81]:
# Forward-fill any remaining gaps with the previous row's value.
# DataFrame.ffill() replaces the deprecated fillna(method="ffill") spelling.
df = df.ffill()

  df = df.fillna(method="ffill")


In [82]:
df.info()  # re-check column dtypes and non-null counts after the cleaning steps

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7637 entries, 0 to 7636
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   datetime                 7637 non-null   datetime64[ns]
 1   humidity                 7637 non-null   float64       
 2   precipcover              7637 non-null   float64       
 3   windspeed                7637 non-null   float64       
 4   cloudcover               7637 non-null   float64       
 5   visibility               7637 non-null   float64       
 6   location                 7637 non-null   object        
 7   sunrise_time             7637 non-null   object        
 8   sunset_time              7637 non-null   object        
 9   Port of Spain precip mm  7637 non-null   float64       
 10  tempmax c                7637 non-null   float64       
 11  tempmin c                7637 non-null   float64       
 12  avgtemp c                7637 non-

In [83]:
# Print every remaining column name next to its number of distinct (non-null) values.
for column_name, distinct_count in df.nunique().items():
    print(column_name, distinct_count)

datetime 7637
humidity 333
precipcover 25
windspeed 120
cloudcover 698
visibility 96
location 1
sunrise_time 2382
sunset_time 2518
Port of Spain precip mm 817
tempmax c 101
tempmin c 70
avgtemp c 72
feelslikemax c 175
feelslikemin c 74
avgfeelsliketemp c 116
dewpoint c 76


In [84]:
df.isna().sum()  # number of missing values left in each column

datetime                   0
humidity                   0
precipcover                0
windspeed                  0
cloudcover                 0
visibility                 0
location                   0
sunrise_time               0
sunset_time                0
Port of Spain precip mm    0
tempmax c                  0
tempmin c                  0
avgtemp c                  0
feelslikemax c             0
feelslikemin c             0
avgfeelsliketemp c         0
dewpoint c                 0
dtype: int64

In [85]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
df.to_csv("refined_data.csv", index=False)