# Data Science Challenge

## Data Description
Column | Description
:---|:---
`id` | Record index
`timestamp` | Datetime (YYYY:MM:DD HH:MM:SS)
`country` | Current country of employment
`employment_status` | Employment status (Full time, Part time, Independent or freelancer)
`job_title` | Current job title of the candidate
`job_years` | No. of years working on the job
`is_manager` | Whether the candidate holds a managerial position or not (Yes or No)
`hours_per_week` | No. of hours per week committed to the current job
`telecommute_days_per_week` | No. of telecommuting days per week (working from home)
`education` | Highest degree in education the candidate has received
`is_education_computer_related` | Is the education related to the field of computer science (Yes or No)
`certifications` | Does the candidate have any relevant certifications (Yes or No)
`salary` | Monthly salary (in US dollars)

In [516]:
import json
import numpy as np
import pandas as pd
import geopandas as gpd
from statistics import mean
from datetime import datetime
from collections import Counter
import matplotlib.pyplot as plt
from collections import defaultdict

# Raise pandas' display limit so printed DataFrames show up to 101 columns
# before the output is truncated.
pd.set_option('display.max_columns', 101)

In [9]:
class Data():
    """Load the training and test CSV files into pandas DataFrames.

    Attributes:
        train: DataFrame read from ``train_path``.
        test: DataFrame read from ``test_path``.
    """

    def __init__(self, train_path='train.csv', test_path='test.csv'):
        """Read both CSV files.

        Args:
            train_path: Location of the training CSV. Defaults to
                ``'train.csv'`` in the current working directory.
            test_path: Location of the test CSV. Defaults to ``'test.csv'``
                in the current working directory.
        """
        self.train = pd.read_csv(train_path)
        self.test = pd.read_csv(test_path)

<class 'pandas.core.series.Series'>
{'country': [], 'code': [], 'count': []}


In [520]:
class EDA(Data):
    """Exploratory summaries and plots for the training set loaded by ``Data``."""

    def __init__(self):
        """Load the data, then run the currently enabled view."""
        super().__init__()
        # Only the (work-in-progress) map view runs on construction. The other
        # views can be run on demand:
        #     self.describe(self.train)
        #     self.simple_plot(self.train)
        #     self.range_plot(self.train)
        #     self.trend_plot(self.train)
        self.map_plot(self.train)

    def describe(self, train):
        """Print the column headers, numeric summary and unique category values.

        Args:
            train: DataFrame with ``country``, ``employment_status``,
                ``job_title`` and ``education`` columns.
        """
        print('Column Headers')
        print(train.columns)
        print()
        print('Description of the numerical values of the data')
        print(train.describe())
        print()
        print('Unique values for columns')
        categorical = (
            ('Countries:', 'country'),
            ('Employment Status:', 'employment_status'),
            ('Job Titles:', 'job_title'),
            ('Education:', 'education'),
        )
        for heading, column in categorical:
            print(heading)
            print(train[column].unique())
            print()

    def simple_plot(self, train):
        """Draw value-count bar charts for the categorical columns.

        The country counts get their own figure; the remaining columns share
        a grid of subplots, three panels per row.

        Args:
            train: DataFrame holding the columns referenced below.
        """
        train['country'].value_counts().plot(kind='bar', figsize=(20, 10), rot=90)
        plt.title('Country')
        simple_count = {
            'Employment Status': train['employment_status'].value_counts(),
            'Job Title': train['job_title'].value_counts(),
            'Manager': train['is_manager'].value_counts(),
            'Education': train['education'].value_counts(),
            'Computer Education': train['is_education_computer_related'].value_counts(),
            'Certifications': train['certifications'].value_counts(),
            'Telecommute Days per Week': train['telecommute_days_per_week'].value_counts(),
        }
        # Size the grid from the number of panels: a fixed 2x3 grid cannot
        # hold all seven charts and indexing the seventh raised IndexError.
        n_cols = 3
        n_rows = -(-len(simple_count) // n_cols)  # ceiling division
        figure, axes = plt.subplots(
            n_rows, n_cols, figsize=(20, 5 * n_rows), squeeze=False
        )
        panels = axes.ravel()
        for ax, (title, counts) in zip(panels, simple_count.items()):
            ax.bar(counts.index, counts.values)
            ax.set_title(title)
            ax.tick_params(axis='x', labelrotation=90)
        # Hide the grid cells that have no chart.
        for ax in panels[len(simple_count):]:
            ax.set_visible(False)
        plt.subplots_adjust(top=0.99, bottom=0.01, hspace=1.5, wspace=0.4)
        plt.show()

    @staticmethod
    def _data_range(data, breaks=10):
        """Count the values of ``data`` in ``breaks`` equal-width bins.

        Bins are half-open ``[left, right)`` except the last, which also
        includes the maximum. Missing values are ignored.

        Args:
            data: Numeric pandas Series.
            breaks: Number of bins.

        Returns:
            A ``(left_edges, counts)`` pair of equal-length arrays; both are
            empty when ``data`` has no non-missing values.
        """
        values = data.dropna().to_numpy()
        if values.size == 0:
            return np.array([]), np.array([], dtype=int)
        counts, edges = np.histogram(values, bins=breaks)
        return edges[:-1], counts

    def range_plot(self, train):
        """Draw binned distributions of job years, weekly hours and salary.

        Args:
            train: DataFrame with numeric ``job_years``, ``hours_per_week``
                and ``salary`` columns.
        """
        range_count = {
            'Job Year': self._data_range(train['job_years']),
            'Hours Per Week': self._data_range(train['hours_per_week']),
            'Salary': self._data_range(train['salary']),
        }
        figure, axes = plt.subplots(3, 1, figsize=(15, 10))
        for ax, (title, (left_edges, counts)) in zip(axes, range_count.items()):
            # Each bar is positioned at its bin's left edge. Salary bins are
            # far wider than the others, hence the larger fixed bar width.
            ax.bar(left_edges, counts, width=300 if title == 'Salary' else 0.5)
            ax.set_title(title)
        plt.subplots_adjust(hspace=0.25)
        plt.show()

    def trend_plot(self, train):
        """Plot the mean salary per calendar day in chronological order.

        Args:
            train: DataFrame with a ``salary`` column and a ``timestamp``
                column whose date part (the text before the first space) is
                formatted ``mm/dd/yyyy``.
        """
        daily_salaries = defaultdict(list)
        for stamp, salary in zip(train['timestamp'], train['salary']):
            mm, dd, yyyy = stamp.split(' ')[0].split('/')
            # Integer (year, month, day) keys sort chronologically even when
            # the month or day is not zero-padded in the data.
            daily_salaries[(int(yyyy), int(mm), int(dd))].append(salary)
        days = sorted(daily_salaries)
        labels = ['%04d%02d%02d' % day for day in days]
        averages = [mean(daily_salaries[day]) for day in days]
        # The size is given when the figure is created; assigning
        # rcParams["figure.figsize"] after drawing does not resize it.
        figure, ax = plt.subplots(figsize=(15, 7))
        ax.plot(labels, averages)
        # One tick per day is unreadable, so the whole x axis is hidden.
        ax.xaxis.set_visible(False)
        ax.set_xlabel('Time')
        ax.set_ylabel('Salary')
        ax.set_title('Salary vs Time')
        plt.show()

    def map_plot(self, train, country='Sweden'):
        """Print the distinct countries and the rows of one country.

        Work in progress: no map is drawn yet.

        Args:
            train: DataFrame with a ``country`` column.
            country: Country whose rows are printed.
        """
        countries = train['country'].unique()
        print(countries)
        # TODO: aggregate per country (employee/manager counts, average
        # salary by employment status, education mix) and draw the map.
        rows = train.loc[train['country'] == country]
        print(rows)
        

In [521]:
# Constructing EDA reads the CSV files (via Data.__init__) and then runs the
# views enabled in EDA.__init__.
EDA()

['Slovenia' 'United States' 'Sweden' 'United Kingdom' 'Canada'
 'New Zealand' 'Belgium' 'France' 'Australia' 'India' 'Denmark' 'Romania'
 'Poland' 'Norway' 'Croatia' 'Netherlands' 'Hungary' 'Argentina'
 'Costa Rica' 'Switzerland' 'Germany' 'Mexico' 'Ukraine' 'Spain'
 'Czech Republic' 'Portugal' 'South Africa' 'Hong Kong' 'Russia' 'Ireland'
 'Guernsey' 'Israel' 'Bulgaria' 'Uganda' 'Finland' 'Italy' 'Jersey'
 'United Arab Emirates' 'Austria' 'Turkey' 'Bahrain' 'Greece' 'Colombia'
 'Kenya' 'Peru' 'Saudi Arabia' 'Albania' 'Iceland' 'Guatemala' 'Belarus'
 'Moldova' 'Puerto Rico' 'Brazil' 'Indonesia' 'Slovakia'
 'Serbia and Montenegro' 'Singapore' 'Malta' 'Venezuela' 'Latvia' 'China'
 'Ecuador' 'Pakistan' 'Vietnam' 'Bolivia' 'Paraguay' 'Thailand'
 'Lithuania' 'Jordan' 'Macedonia' 'Malaysia' 'Luxembourg' 'Philippines'
 'Syria' 'Ghana' 'Taiwan' 'Estonia' 'Uruguay']


<__main__.EDA at 0x1459d1ca0>