In [1]:
import pandas as pd
from scipy.stats import trim_mean
import numpy as np

In [2]:
# importing the csv file 
df = pd.read_csv("weather_forecast_data.csv")
df.head()                                          # printing the head of the file 

Unnamed: 0,Temperature,Humidity,Wind_Speed,Cloud_Cover,Pressure,Rain
0,23.720338,89.592641,7.335604,50.501694,1032.378759,rain
1,27.879734,46.489704,5.952484,4.990053,992.61419,no rain
2,25.069084,83.072843,1.371992,14.855784,1007.23162,no rain
3,23.62208,74.367758,7.050551,67.255282,982.632013,rain
4,20.59137,96.858822,4.643921,47.676444,980.825142,no rain


In [13]:
# let's do the statistical analysis on all numerical columns present in the dataset 

# Estimates of location 
class Locality:
    def mean(self,column):
        tot = 0
        n = len(df[column])
        for i in df[column]:
            tot += i
        return tot/n
    
    def trimmed_mean(self,column,p):
        try :
            if p > 1 and p < 100:
                n = len(df[column])
                p = int(n*(p/100))
                tot = 0
                arr = np.array(df[column])
                arr.sort()
                for i in range(p+1,n-p):
                    tot += arr[i]
                return (tot/ (n-(2*p)))
        except:
            return 'please enter a valid value'
            
    
    def median(self,column):
        arr = np.array(df[column])
        arr.sort()
        n = len(arr)
        if n % 2 != 0:                                      # odd case
            return arr[(n+1)//2]
        else:                                              # even case
            return ((arr[(n//2)] + arr[(n//2)+1])/2)
    
    def allEstimates(self,column,trim_val = 0.2):             # following function takes column as input and returns mean, trimmed mean, and median of the data 
        print(column)
        print('-'*20)
        print("Mean         :",df[column].mean())
        print("Trimmed Mean :", trim_mean(df[column],trim_val))
        print("Median       :", df[column].median())


In [16]:
loc = Locality()
loc.trimmed_mean('Humidity',0)

'value should not be more than 100 or less than 1'

In [101]:
# Estimates of variability

class Variability:
    # Mean absolute deviation/manhattan norm/ l1 norm 
    def mad(self,colname):
        mean_val = df[colname].mean()                                     # calculating the mean value
        length = len(df[colname])                                         # length of the feature
        tot = 0                                                           # creating a variable and initialising the value to 0 
        for x in df[colname]:                                             # loop through the values of that column
            tot += abs(x-mean_val)                                        # x-mean_val --> mean deviation 
        return round(tot/length,3)                                        # returning the rounded value
        
    # using numpy calculate the mean absolute deviation 
    def mad2(self,colname):
        mean_ = np.mean(df[colname])                                      # taking the mean of pd.core.series.Series
        abs_deviation = np.abs(df[colname] - mean_)                       # converting to positive values 
        mad2 = round(np.mean(abs_deviation),3)                            # calculating the mean deviation and rounding to 3 digit
        return mad2                                                       # returning the value
    
    # variance/mean squared error  
    def variance(self,colName):
        n = len(df[colName])                                             # calculating the length of the column
        tot = 0                                                          # initialised variable with 0
        mean_val = df[colName].mean()                                    # calculating the mean value for that column
        for x in df[colName]:                                            # loop
            tot += (x - mean_val)**2                                     # (x - mean_val)**2 --> squared mean deviation 
        return round(tot/(n-1),3)                                        # returing the variance
        
    def variance2(self,colName):
        return round(df[colName].var() ,3)                               # in-built function to calculate the variance   
    
    # standard deviation/ l2 norm / euclidean norm 
    def stdDev(self,colName):                                  
        return round(self.variance(colName)**0.5,6)                      # calling the variance function to take the square root of it                                        
    
    def stdDev2(self,colName):
        return round(df[colName].std(),6)                                # in-built standard deviation calling function 

    # range
    def range_(self,colname):
        return round(min(df[colname]),6),round(max(df[colname]),6)       # return minimum and maximum value of that column

    # inter quartile range 
    def interQuartilerange(self,colname):
        return df[colname].quantile(0.75) - df[colname].quantile(0.25)   # returns the difference between the 75 and 25 quantile value

    def quantile(self,colname,val = 0.01):
        if val > 0 and val < 1:                                          # checks if the value greater than 0 and less than 1 or not 
            return df[colname].quantile(val)                             # return quantile value already initialised to 0.5 to find the median 
        else:                                                            # else throws an error message 
            return "please check the value should be less than 1 and greater 0"

    def displayValue(self,colname):
        print('Mean absolute deviation :', self.mad(colname))
        print('Variance                :', self.variance(colname))
        print('Standard Deviation      :', self.stdDev(colname))
        print('range                   :', self.range(colname))
        print('Interquantile range     :', self.interQuartilerange(colname))
        print('Quantile                :', self.quantile(colname))
    

In [76]:
stat = Variability()
stat.displayValue('Wind_Speed')

Mean absolute deviation : 5.012
Variance                : 33.412
Standard Deviation      : 5.780311410296162
range                   : (0.009819, 19.999132)
Interquantile range     : 10.186498793786143
Quantile                : 0.1417012330694801


In [33]:
df.describe()

Unnamed: 0,Temperature,Humidity,Wind_Speed,Cloud_Cover,Pressure
count,2500.0,2500.0,2500.0,2500.0,2500.0
mean,22.581725,64.347094,9.906255,49.658104,1014.312336
std,7.326996,19.954739,5.780316,29.123104,20.196433
min,10.001842,30.005071,0.009819,0.015038,980.014486
25%,16.359398,47.339815,4.761909,23.900016,996.93863
50%,22.536448,63.920797,9.908572,49.488284,1013.433035
75%,28.976476,81.561021,14.948408,75.32414,1031.735067
max,34.995214,99.997481,19.999132,99.997795,1049.985593


In [23]:
def range(colname):
    return min(df[colname]),max(df[colname])
range('Temperature')

(10.001842485827652, 34.99521445292413)

34.22120589766598

In [None]:
T