<a href="https://colab.research.google.com/github/StealthyNinja26/Rainfall_austin_tx/blob/main/Rainfall_(1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from scipy import stats

# Load the raw Austin weather observations into a DataFrame.
data = pd.read_csv("austin_weather.csv")

# Drop the columns this analysis does not use.
data = data.drop(columns=['Events', 'Date', 'SeaLevelPressureHighInches',
                          'SeaLevelPressureLowInches'])

# 'T' denotes trace rainfall and '-' denotes an unavailable reading.
# Both markers are replaced with 0.0 so every column can later be parsed
# as numeric.
# NOTE(review): mapping '-' (missing) to 0.0 treats "no data" as a real
# zero reading, which pulls means and confidence intervals downward.
# Consider NaN plus explicit imputation once the downstream cells are
# NaN-aware.
data = data.replace(['T', '-'], 0.0)

# Save the cleaned data. index=False keeps the RangeIndex out of the file;
# otherwise it is written as an unnamed first column and comes back as a
# spurious 'Unnamed: 0' column when the CSV is read again.
data.to_csv('austin_final.csv', index=False)



In [None]:
# Preview the first five rows of the cleaned frame (five is head()'s default).
data.head()

Unnamed: 0,TempHighF,TempAvgF,TempLowF,DewPointHighF,DewPointAvgF,DewPointLowF,HumidityHighPercent,HumidityAvgPercent,HumidityLowPercent,SeaLevelPressureAvgInches,VisibilityHighMiles,VisibilityAvgMiles,VisibilityLowMiles,WindHighMPH,WindAvgMPH,WindGustMPH,PrecipitationSumInches
0,74,60,45,67,49,43,93,75,57,29.68,10,7,2,20,4,31,0.46
1,56,48,39,43,36,28,93,68,43,30.13,10,10,5,16,6,25,0.0
2,58,45,32,31,27,23,76,52,27,30.49,10,10,10,8,3,12,0.0
3,61,46,31,36,28,21,89,56,22,30.45,10,10,7,12,4,20,0.0
4,58,50,41,44,40,36,86,71,56,30.33,10,10,7,10,2,16,0.0


In [None]:
# Print each column's non-null count and dtype, as stored after the replacements.
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1319 entries, 0 to 1318
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   TempHighF                  1319 non-null   int64 
 1   TempAvgF                   1319 non-null   int64 
 2   TempLowF                   1319 non-null   int64 
 3   DewPointHighF              1319 non-null   object
 4   DewPointAvgF               1319 non-null   object
 5   DewPointLowF               1319 non-null   object
 6   HumidityHighPercent        1319 non-null   object
 7   HumidityAvgPercent         1319 non-null   object
 8   HumidityLowPercent         1319 non-null   object
 9   SeaLevelPressureAvgInches  1319 non-null   object
 10  VisibilityHighMiles        1319 non-null   object
 11  VisibilityAvgMiles         1319 non-null   object
 12  VisibilityLowMiles         1319 non-null   object
 13  WindHighMPH                1319 non-null   object
 14  WindAvgM

In [None]:
def conv_func(col, frame=None):
  """Convert one column to a numeric dtype, modifying the frame in place.

  Args:
    col: Name of the column to convert.
    frame: DataFrame to modify. Defaults to the notebook-level ``data``
      frame, so existing ``conv_func(col)`` calls keep working.

  Raises:
    ValueError: Propagated from ``pd.to_numeric`` when a value in the
      column cannot be parsed as a number.
  """
  # Fall back to the notebook-level frame only when none is supplied.
  target = data if frame is None else frame
  target[col] = pd.to_numeric(target[col])

# Convert every column to a numeric dtype, then re-check the dtypes.
for column_name in data.columns:
  conv_func(column_name)

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1319 entries, 0 to 1318
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   TempHighF                  1319 non-null   int64  
 1   TempAvgF                   1319 non-null   int64  
 2   TempLowF                   1319 non-null   int64  
 3   DewPointHighF              1319 non-null   float64
 4   DewPointAvgF               1319 non-null   float64
 5   DewPointLowF               1319 non-null   float64
 6   HumidityHighPercent        1319 non-null   float64
 7   HumidityAvgPercent         1319 non-null   float64
 8   HumidityLowPercent         1319 non-null   float64
 9   SeaLevelPressureAvgInches  1319 non-null   float64
 10  VisibilityHighMiles        1319 non-null   float64
 11  VisibilityAvgMiles         1319 non-null   float64
 12  VisibilityLowMiles         1319 non-null   float64
 13  WindHighMPH                1319 non-null   float

In [None]:
# Count the rows that exactly repeat an earlier row.
data.duplicated().sum()

np.int64(0)

In [None]:
# Per-column count of missing (NaN) entries.
data.isna().sum()

Unnamed: 0,0
TempHighF,0
TempAvgF,0
TempLowF,0
DewPointHighF,0
DewPointAvgF,0
DewPointLowF,0
HumidityHighPercent,0
HumidityAvgPercent,0
HumidityLowPercent,0
SeaLevelPressureAvgInches,0


In [None]:
def simp_mean(col, frame=None):
  """Print the arithmetic mean of one column, formatted to two decimals.

  Args:
    col: Name of the column to summarise.
    frame: DataFrame holding the column. Defaults to the notebook-level
      ``data`` frame, so existing ``simp_mean(col)`` calls keep working.
  """
  # Fall back to the notebook-level frame only when none is supplied.
  source = data if frame is None else frame
  print("Mean value of " + col +":{:.2f}".format(source[col].mean()))

# Report the plain mean of every column.
for column_name in data.columns:
  simp_mean(column_name)

Mean value of TempHighF:80.86
Mean value of TempAvgF:70.64
Mean value of TempLowF:59.90
Mean value of DewPointHighF:61.19
Mean value of DewPointAvgF:56.33
Mean value of DewPointLowF:50.66
Mean value of HumidityHighPercent:87.74
Mean value of HumidityAvgPercent:66.57
Mean value of HumidityLowPercent:44.89
Mean value of SeaLevelPressureAvgInches:29.95
Mean value of VisibilityHighMiles:9.90
Mean value of VisibilityAvgMiles:9.08
Mean value of VisibilityLowMiles:6.78
Mean value of WindHighMPH:13.21
Mean value of WindAvgMPH:5.00
Mean value of WindGustMPH:21.31
Mean value of PrecipitationSumInches:0.11


In [None]:
def trim_mean(col, frame=None, proportiontocut=0.1):
  """Print the trimmed mean of one column, formatted to two decimals.

  The fraction ``proportiontocut`` is removed from each end of the sorted
  values before averaging (see ``scipy.stats.trim_mean``).

  Args:
    col: Name of the column to summarise.
    frame: DataFrame holding the column. Defaults to the notebook-level
      ``data`` frame, so existing ``trim_mean(col)`` calls keep working.
    proportiontocut: Fraction trimmed from each tail. Defaults to 0.1.
  """
  # Fall back to the notebook-level frame only when none is supplied.
  source = data if frame is None else frame
  trimmed = stats.trim_mean(source[col], proportiontocut)
  print("Trimmed mean of " + col +":{:.2f}".format(trimmed))

# Report the trimmed mean of every column.
for column_name in data.columns:
  trim_mean(column_name)

Trimmed mean of TempHighF:82.13
Trimmed mean of TempAvgF:71.74
Trimmed mean of TempLowF:61.05
Trimmed mean of DewPointHighF:63.16
Trimmed mean of DewPointAvgF:58.13
Trimmed mean of DewPointLowF:52.17
Trimmed mean of HumidityHighPercent:89.55
Trimmed mean of HumidityAvgPercent:66.81
Trimmed mean of HumidityLowPercent:43.84
Trimmed mean of SeaLevelPressureAvgInches:30.01
Trimmed mean of VisibilityHighMiles:10.00
Trimmed mean of VisibilityAvgMiles:9.48
Trimmed mean of VisibilityLowMiles:7.17
Trimmed mean of WindHighMPH:13.04
Trimmed mean of WindAvgMPH:4.92
Trimmed mean of WindGustMPH:21.04
Trimmed mean of PrecipitationSumInches:0.01


In [None]:
def med(col, frame=None):
  """Print the median of one column, formatted to two decimals.

  Args:
    col: Name of the column to summarise.
    frame: DataFrame holding the column. Defaults to the notebook-level
      ``data`` frame, so existing ``med(col)`` calls keep working.
  """
  # Fall back to the notebook-level frame only when none is supplied.
  source = data if frame is None else frame
  print("Median of " + col +":{:.2f}".format(source[col].median()))

# Report the median of every column.
for column_name in data.columns:
  med(column_name)

Median of TempHighF:83.00
Median of TempAvgF:73.00
Median of TempLowF:63.00
Median of DewPointHighF:66.00
Median of DewPointAvgF:61.00
Median of DewPointLowF:56.00
Median of HumidityHighPercent:90.00
Median of HumidityAvgPercent:67.00
Median of HumidityLowPercent:44.00
Median of SeaLevelPressureAvgInches:30.00
Median of VisibilityHighMiles:10.00
Median of VisibilityAvgMiles:10.00
Median of VisibilityLowMiles:9.00
Median of WindHighMPH:13.00
Median of WindAvgMPH:5.00
Median of WindGustMPH:21.00
Median of PrecipitationSumInches:0.00


In [None]:
def con_int(feature, frame=None, confidence_level=0.95):
    """Return a Student-t confidence interval for the mean of one column.

    The interval is centred on the sample mean, scaled by the standard
    error of the mean, and uses ``n - 1`` degrees of freedom.

    Args:
        feature: Name of the column to analyse.
        frame: DataFrame holding the column. Defaults to the notebook-level
            ``data`` frame, so existing ``con_int(feature)`` calls keep
            working.
        confidence_level: Coverage of the interval. Defaults to 0.95.

    Returns:
        The ``(lower, upper)`` pair produced by ``scipy.stats.t.interval``.
    """
    # Fall back to the notebook-level frame only when none is supplied.
    source = data if frame is None else frame
    sample = source[feature]
    degrees_freedom = sample.size - 1
    sample_mean = np.mean(sample)
    sample_standard_error = stats.sem(sample)
    # The confidence level stays positional because its keyword name differs
    # between SciPy releases; the distribution arguments are passed by name.
    return stats.t.interval(
        confidence_level,
        df=degrees_freedom,
        loc=sample_mean,
        scale=sample_standard_error,
    )

# Report the confidence interval of the mean for every column.
for column_name in data.columns:
    lower, upper = con_int(column_name)
    print("Confidence interval of {} = ({:.2f}, {:.2f})".format(column_name, lower, upper))

Confidence interval of TempHighF = (80.07, 81.66)
Confidence interval of TempAvgF = (69.88, 71.40)
Confidence interval of TempLowF = (59.14, 60.67)
Confidence interval of DewPointHighF = (60.42, 61.96)
Confidence interval of DewPointAvgF = (55.50, 57.16)
Confidence interval of DewPointLowF = (49.77, 51.56)
Confidence interval of HumidityHighPercent = (87.12, 88.36)
Confidence interval of HumidityAvgPercent = (65.88, 67.26)
Confidence interval of HumidityLowPercent = (43.97, 45.81)
Confidence interval of SeaLevelPressureAvgInches = (29.88, 30.03)
Confidence interval of VisibilityHighMiles = (9.85, 9.95)
Confidence interval of VisibilityAvgMiles = (8.99, 9.17)
Confidence interval of VisibilityLowMiles = (6.58, 6.98)
Confidence interval of WindHighMPH = (13.02, 13.40)
Confidence interval of WindAvgMPH = (4.89, 5.11)
Confidence interval of WindGustMPH = (20.99, 21.63)
Confidence interval of PrecipitationSumInches = (0.09, 0.14)
