# Descriptive analysis

In [None]:
import pandas as pd

In [192]:
class DataDescriber:
    """Print and display simple descriptive summaries of a CSV data set.

    The file is loaded into ``self.df`` with its first column used as the
    row index. Every reporting method writes its result to the output
    (``print`` or a rich notebook display) and returns ``None``.
    """

    def __init__(self, filepath):
        """Load ``filepath`` into ``self.df``.

        Loading problems are reported on stdout instead of being raised.
        When the file cannot be read, ``self.df`` is left as ``None``.
        """
        # Define the attribute up front so a failed load leaves ``None``
        # behind rather than a missing attribute.
        self.df = None
        try:
            self.df = pd.read_csv(filepath)
            self.df = self.df.set_index(self.df.columns[0], drop=True)
        except Exception as e:
            print('File could not be read')
            print(f'Error: {e}')

    @staticmethod
    def _show(frame):
        """Render ``frame`` with IPython's ``display`` or fall back to ``print``."""
        # A notebook exposes ``display`` as a global, a plain interpreter does
        # not; importing it explicitly keeps the class usable in both.
        try:
            from IPython.display import display
        except ImportError:
            print(frame)
        else:
            display(frame)

    @staticmethod
    def _ordered_labels(labels):
        """Return ``labels`` sorted ascending with the code ``'X'`` moved last.

        Text labels that look like integers are ordered numerically (so '2'
        comes before '10'); the original label objects are returned, never a
        re-formatted copy. Labels that cannot be ordered that way fall back to
        their natural sort order.
        """
        labels = list(labels)
        regular = [label for label in labels if label != 'X']
        try:
            ordered = sorted(
                regular,
                key=lambda label: int(label) if isinstance(label, str) else label,
            )
        except (TypeError, ValueError):
            ordered = sorted(regular)
        if len(regular) != len(labels):
            ordered.append('X')
        return ordered

    def no_records(self):
        """Print the number of rows in the data set."""
        n = len(self.df)
        print(f'The data set has {n} rows')

    def col_types(self):
        """Print, for every column, the Python type of its first value."""
        for name in self.df.columns:
            # Positional access: the row index comes from the file's first
            # column and is not guaranteed to contain the label 0.
            first_value = self.df[name].iloc[0]
            print(f'Column {name} is of data type {type(first_value)}')

    def unique_values(self):
        """Print the sorted distinct values of every column.

        The columns 'Record_Number' and 'Region' are skipped.
        """
        skipped = ('Record_Number', 'Region')
        for name in self.df.columns:
            if name in skipped:
                continue
            # sort_values() places missing values last instead of failing when
            # they are mixed with text; to_numpy() keeps the array-style output.
            vals = self.df[name].drop_duplicates().sort_values().to_numpy()
            print(f'Column {name} takes values {vals}')

    def grouped_no_records(self, col1, col2, col1_vals=None, col2_vals=None, col1_group=False, col2_group=False):
        """Display the count of 'Record_Number' values per (col1, col2) group.

        Parameters:
            col1, col2: names of the columns to group by.
            col1_vals, col2_vals: optional collections (list, Series, array, ...)
                of values to keep for the respective column; ``None`` keeps all.
            col1_group: when true, sum over ``col1`` and display one total per
                ``col2`` value.
            col2_group: when true (and ``col1_group`` is not), sum over ``col2``
                and display one total per ``col1`` value.

        With neither flag set, a table is displayed with ``col1`` values as
        rows and ``col2`` values as columns; combinations that do not occur
        are shown as 0. Nothing is returned.
        """
        counts = self.df.groupby([col1, col2])['Record_Number'].count().to_frame()

        # ``is not None`` (rather than ``!= None``) so that array-like filters
        # such as a Series are accepted.
        if col1_vals is not None:
            counts = counts[counts.index.get_level_values(0).isin(col1_vals)]
        if col2_vals is not None:
            counts = counts[counts.index.get_level_values(1).isin(col2_vals)]

        if col1_group or col2_group:
            # col1_group takes precedence when both flags are set.
            kept_level = 1 if col1_group else 0
            totals = counts.groupby(counts.index.get_level_values(kept_level)).sum()
            totals = totals.rename(columns={'Record_Number': 'Number of Records'})
            self._show(totals.round(0))
            return

        table = counts.unstack().fillna(0).astype(int)
        # Drop the outer 'Record_Number' level left over from unstacking.
        table.columns = table.columns.droplevel()
        table = table.reindex(self._ordered_labels(table.columns), axis=1)
        self._show(table)


In [193]:
# Load the refined data file, then display record counts for several column pairings.
dd = DataDescriber(filepath='D:\\University\\Python for Data Analysis\\Repos\\PFDAAV\\data\\Scotland_teaching_file_1PCT_refined.csv')

# Cross-tabulations over all values of both columns.
dd.grouped_no_records(col1='Hours_Worked_Per_Week', col2='industry')
dd.grouped_no_records(col1='Occupation', col2='Approximate_Social_Grade')

# Economic_Activity restricted to codes '1'-'4': the full age table first,
# then totals per Economic_Activity (age summed out), then totals per age.
dd.grouped_no_records(col1='age', col2='Economic_Activity', col2_vals=['1', '2', '3', '4'])
dd.grouped_no_records(col1='age', col2='Economic_Activity', col2_vals=['1', '2', '3', '4'], col1_group=True)
dd.grouped_no_records(col1='age', col2='Economic_Activity', col2_vals=['1', '2', '3', '4'], col2_group=True)

industry,1,2,3,4,5,6,7,8,9,10,11,12,13,X
Hours_Worked_Per_Week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,21,58,44,619,331,72,34,230,59,230,259,103,83,0
2,49,201,166,1318,635,371,224,639,328,661,1518,183,225,0
3,191,2533,1801,2255,738,1555,1048,1936,1592,1377,2591,354,362,0
4,258,559,427,411,230,432,117,425,130,253,184,57,60,0
X,379,2967,1438,2954,1375,1324,556,1588,1116,1602,2265,410,442,14435


Approximate_Social_Grade,1,2,3,4,X
Occupation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1413,1686,213,192,46
2,4739,1670,465,289,74
3,933,3336,377,280,89
4,707,3883,816,538,66
5,218,612,4056,1176,78
6,324,897,1530,1679,54
7,381,1393,870,2172,118
8,98,367,1619,2183,60
9,247,933,1196,4675,205
X,320,830,460,1525,11300


Economic_Activity,1,2,3,4
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,2772,82,679,1326
3,5643,501,465,269
4,6250,865,427,97
5,6360,1075,393,40
6,3712,821,202,10
7,539,230,15,0
8,74,49,2,1


Unnamed: 0_level_0,Number of Records
Economic_Activity,Unnamed: 1_level_1
1,25350
2,3623
3,2183
4,1743


Unnamed: 0_level_0,Number of Records
age,Unnamed: 1_level_1
2,4859
3,6878
4,7639
5,7868
6,4745
7,784
8,126


In [55]:
# Reload the refined data file and print its basic description:
# row count, per-column value type, and the distinct values of each column.
dd = DataDescriber(filepath='D:\\University\\Python for Data Analysis\\Repos\\PFDAAV\\data\\Scotland_teaching_file_1PCT_refined.csv')
dd.no_records()    # number of rows
dd.col_types()     # type of the first value in every column
dd.unique_values() # sorted distinct values per column

The data set has 63388 rows
Column Record_Number is of data type <class 'numpy.int64'>
Column Region is of data type <class 'str'>
Column RESIDENCE_TYPE is of data type <class 'str'>
Column Family_Composition is of data type <class 'str'>
Column sex is of data type <class 'numpy.int64'>
Column age is of data type <class 'numpy.int64'>
Column Marital_Status is of data type <class 'numpy.int64'>
Column student is of data type <class 'numpy.int64'>
Column Country_Of_Birth is of data type <class 'numpy.int64'>
Column health is of data type <class 'numpy.int64'>
Column Ethnic_Group is of data type <class 'numpy.int64'>
Column religion is of data type <class 'numpy.int64'>
Column Economic_Activity is of data type <class 'str'>
Column Occupation is of data type <class 'str'>
Column industry is of data type <class 'str'>
Column Hours_Worked_Per_Week is of data type <class 'str'>
Column Approximate_Social_Grade is of data type <class 'str'>
Column RESIDENCE_TYPE takes values ['C' 'P']
Column Fa