In [17]:
import pandas as pd

In [18]:
# A pandas DataFrame can be thought of as a dictionary of columns.
# This first dictionary is a plain Python dict: its values have different
# shapes (a string, a 2-item list and a 3-item list).
people = dict(
    ID="1",
    first="Shatha",
    last=["Maayouf", "Fitouri"],
    age=[24, 35, 10],
)

# Dictionary laid out for conversion to a DataFrame:
# every value is a list, and all lists have the same length.
people_as_dataframe = dict(
    ID=[1, 2, 3],
    first=["Shatha", "ahmed", "laila"],
    last=["Maayouf", "Fitouri", "Tili"],
    age=[24, 35, 10],
)

In [19]:
# look up the value stored under a dictionary key
people["ID"]

'1'

In [20]:
# convert the dictionary to a pandas DataFrame:
#   dictionary keys   --> column names
#   dictionary values --> column values
df_from_dictionary = pd.DataFrame(people_as_dataframe)
df_from_dictionary

Unnamed: 0,ID,first,last,age
0,1,Shatha,Maayouf,24
1,2,ahmed,Fitouri,35
2,3,laila,Tili,10


In [21]:
# select a single column by its name, just like reading a dictionary key;
# the result is a pandas Series
df_from_dictionary["first"]

0    Shatha
1     ahmed
2     laila
Name: first, dtype: object

In [22]:
# index_col=0 tells read_csv to use the first column of the CSV file as the row index
df = pd.read_csv("./data/employees_satisfaction.csv", index_col=0)
# head() shows the first rows of the DataFrame
df.head()

Unnamed: 0,emp_id,age,Dept,education,recruitment_type,job_level,rating,awards,certifications,salary,gender,entry_date,last_raise,satisfied
0,HR8270,28,HR,PG,Referral,5,2.0,1,0,86750,m,2019-02-01,,1
1,TECH1860,50,Technology,PG,Recruitment Agency,3,5.0,2,1,42419,Male,2017-01-17,,0
2,TECH6390,43,Technology,UG,Referral,4,1.0,2,0,65715,f,2012-08-27,,1
3,SAL6191,44,Sales,PG,On-Campus,2,3.0,0,0,29805,f,2017-07-25,,1
4,HR6734,33,HR,UG,Recruitment Agency,2,1.0,5,0,29805,m,2019-05-17,,1


# what is a Series?

In [23]:
# a single column selected from a DataFrame is a pandas Series
type(df["age"])

pandas.core.series.Series

#### A dataframe is rows and columns --> 2D
#### a Series is rows of a single column
#### so a dataframe is a container for multiple Series objects

In [24]:
# bracket (dictionary-style) access
df["age"]

# or attribute (dot) access
df.age

# both expressions produce the same Series; only the last one is rendered as the cell output

0      28
1      50
2      43
3      44
4      33
       ..
495    49
496    24
497    34
498    26
499    26
Name: age, Length: 500, dtype: int64

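__df["age"]__  is better because:
1. a column name may match one of the DataFrame's own methods or attributes (for example a column called `count` or `shape`); in that case `df.count` returns the method or attribute, not the column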

# Columns and Rows

#### Find out number of rows

In [25]:
# number of rows = length of the row index
len(df.index)

500

In [26]:
# df.axes holds the row index and the column index; element 0 is the row index
len(df.axes[0])

500

In [27]:
# len() of a DataFrame also gives the number of rows
len(df)

500

#### Find out the number of columns

In [28]:
# element 1 of df.axes is the column index, so its length is the number of columns
len(df.axes[1])

14

In [29]:
# number of columns = length of the column index
len(df.columns)

14

#### Show the columns of df

In [30]:
# column labels taken from the second axis, converted to a plain list
list(df.axes[1])

['emp_id',
 'age',
 'Dept',
 'education',
 'recruitment_type',
 'job_level',
 'rating',
 'awards',
 'certifications',
 'salary',
 'gender',
 'entry_date',
 'last_raise',
 'satisfied']

In [31]:
# column labels taken from df.columns, converted to a plain list
list(df.columns)

['emp_id',
 'age',
 'Dept',
 'education',
 'recruitment_type',
 'job_level',
 'rating',
 'awards',
 'certifications',
 'salary',
 'gender',
 'entry_date',
 'last_raise',
 'satisfied']

In [32]:
# keys() of a DataFrame returns its column labels, mirroring the dictionary interface
list(df.keys())

['emp_id',
 'age',
 'Dept',
 'education',
 'recruitment_type',
 'job_level',
 'rating',
 'awards',
 'certifications',
 'salary',
 'gender',
 'entry_date',
 'last_raise',
 'satisfied']

In [33]:
# .values exposes the column labels as an array; list() converts it to a plain list
list(df.columns.values)

['emp_id',
 'age',
 'Dept',
 'education',
 'recruitment_type',
 'job_level',
 'rating',
 'awards',
 'certifications',
 'salary',
 'gender',
 'entry_date',
 'last_raise',
 'satisfied']

#### What is the data type of each column?

In [34]:
# dtypes lists the data type of every column; display() shows it as the cell output
display(df.dtypes)

emp_id               object
age                   int64
Dept                 object
education            object
recruitment_type     object
job_level             int64
rating              float64
awards                int64
certifications        int64
salary                int64
gender               object
entry_date           object
last_raise          float64
satisfied             int64
dtype: object

#### Determine statistical characteristics of the numeric columns.

In [35]:
# default call: summary statistics for the numeric columns
df.describe()
# or: include='all' also summarises the non-numeric columns (unique/top/freq)
# only this last expression is rendered as the cell output
df.describe(include='all')

Unnamed: 0,emp_id,age,Dept,education,recruitment_type,job_level,rating,awards,certifications,salary,gender,entry_date,last_raise,satisfied
count,500,500.0,500,500,500,500.0,471.0,500.0,500.0,500.0,497,500,26.0,500.0
unique,499,,5,2,4,,,,,,4,475,,
top,MKT7287,,Purchasing,PG,Referral,,,,,,m,2019-09-01,,
freq,2,,114,254,140,,,,,,207,3,,
mean,,39.694,,,,3.032,3.093418,4.57,0.514,50416.056,,,0.049231,0.714
std,,8.477033,,,,1.423738,1.423129,2.989812,0.628167,23671.392661,,,0.029519,0.452342
min,,22.0,,,,1.0,1.0,0.0,0.0,24076.0,,,0.01,0.0
25%,,34.0,,,,2.0,2.0,2.0,0.0,29805.0,,,0.02,0.0
50%,,39.0,,,,3.0,3.0,5.0,0.0,42419.0,,,0.05,1.0
75%,,47.0,,,,4.0,4.0,7.0,1.0,65715.0,,,0.0775,1.0


#### Show NULL values by replacing the df values with True for NULL values, otherwise False

In [36]:
# isnull() returns a DataFrame of booleans: True where a value is missing, False otherwise
newdf = df.isnull()
newdf

Unnamed: 0,emp_id,age,Dept,education,recruitment_type,job_level,rating,awards,certifications,salary,gender,entry_date,last_raise,satisfied
0,False,False,False,False,False,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,False,False,False,False,True,False
2,False,False,False,False,False,False,False,False,False,False,False,False,True,False
3,False,False,False,False,False,False,False,False,False,False,False,False,True,False
4,False,False,False,False,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,False,False,False,False,False,False,False,False,False,False,False,False,True,False
496,False,False,False,False,False,False,False,False,False,False,False,False,True,False
497,False,False,False,False,False,False,True,False,False,False,False,False,True,False
498,False,False,False,False,False,False,False,False,False,False,False,False,True,False


#### Are there any missing values in the data?

In [37]:
# Collapse the element-wise null mask into one flag:
# True as soon as at least one cell of df is missing.
check_nan = (
    df.isnull()
    .values
    .any()
)

# Show the resulting boolean
print(check_nan)

True


### Renaming columns
There are 3 ways to rename columns:
#### 1. rename all columns by reassigning a list of new column names 

In [38]:
# assign a complete list of new names: one entry per existing column, in column order
df_from_dictionary.columns = ["id", "first_name", "last_name","age"]
df_from_dictionary

Unnamed: 0,id,first_name,last_name,age
0,1,Shatha,Maayouf,24
1,2,ahmed,Fitouri,35
2,3,laila,Tili,10


#### 2. rename a single column using the "rename" method

In [39]:
# rename only the 'Dept' column; inplace=True modifies df itself instead of returning a new DataFrame
df.rename(columns = {'Dept': 'department'}, inplace=True)

In case you want to keep the original df unchanged, you can apply the same method but omit the parameter ```inplace=True``` and save the result in a new variable, as follows:

``` df_renamed = df.rename(columns = {'Dept': 'department'})```

You can rename multiple columns by applying the same method as follows:

```df.rename(columns = {'age':'Age', 'emp_id':'id'}, inplace = True)```

#### 3. Rename columns by adding prefixes/suffixes using the DataFrame add_prefix() and add_suffix() functions

Pass the prefix that should be prepended and the suffix that should be appended to every column name, as follows:

In [40]:
# Chain both renaming steps: prepend 'col_' to every column label, then append '_1'.
df_from_dictionary = (
    df_from_dictionary
    .add_prefix('col_')
    .add_suffix('_1')
)
df_from_dictionary

Unnamed: 0,col_id_1,col_first_name_1,col_last_name_1,col_age_1
0,1,Shatha,Maayouf,24
1,2,ahmed,Fitouri,35
2,3,laila,Tili,10


### Inspecting Column values

Using either of the following operations, we can see all distinct values in a column. Note that `value_counts()` leaves out missing values by default, while `unique()` also lists `nan`.

In [41]:
df['gender'].value_counts()

gender
m         207
f         187
Male       57
Female     46
Name: count, dtype: int64

In [42]:
df["gender"].unique()

array(['m', 'Male', 'f', 'Female', nan], dtype=object)

### Unifying Column values

The previous operations showed that in the `gender` column, the values are not coded consistently (`m`/`Male` and `f`/`Female`). We'll unify this column so that `m` becomes `Male` and `f` becomes `Female`. You can use the dictionary below `gender_recoding` for this.

In [13]:
# Mapping from the abbreviated gender codes to their full spelling.
gender_recoding = dict(f="Female", m="Male")

We'll unify using the `map` method together with the `fillna` method: `map` recodes the values in the column according to the dictionary, and `fillna` then fills any resulting NaNs with the original values.

In [15]:
# Recode the column in two steps:
df['gender'] = (
    df['gender']
    .map(gender_recoding)    # values without an entry in the mapping become NaN
    .fillna(df['gender'])    # put the original value back wherever map produced NaN
)
df['gender']

0        Male
1        Male
2      Female
3      Female
4        Male
        ...  
495      Male
496    Female
497      Male
498      Male
499      Male
Name: gender, Length: 500, dtype: object