In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Location of the Adult census CSV. Kept in one named constant so the path can be
# changed in a single place when the notebook runs in a different environment.
DATA_PATH = '/content/drive/MyDrive/DA-CaseStudies/Datasets/adult.csv'

data = pd.read_csv(DATA_PATH)
data.head()

**1. Display Top 10 rows of the Dataset**

In [None]:
# First 10 rows of the frame.
data.head(10)

**2. Check last 10 rows of the Dataset**

In [None]:
# Last 10 rows of the dataset. tail() is the counterpart of head(), which only
# returns rows from the top of the frame.
data.tail(10)

**3. Find Shape of Our Dataset (Number of Rows and Number of Columns)**

In [None]:
# (number of rows, number of columns) as a tuple.
data.shape

In [None]:
# Unpack the shape tuple into named counts, then display them as (rows, columns).
n_rows, n_cols = data.shape
n_rows, n_cols

**4. Get Information About Our Dataset: Total Number of Rows, Total Number of Columns, Datatype of Each Column, and Memory Requirement**


In [None]:
# Prints a summary of the frame: index range, each column's non-null count and dtype, and memory usage.
data.info()

**5. Fetch Random Samples From the Dataset(50%)**

In [None]:
# Draw a reproducible random sample containing 50% of the rows.
data1 = data.sample(frac=0.50 , random_state=100)
data1

# frac :- fraction of rows to return (0.50 = half of the dataset)
# random_state :- seed for the random number generator, so the same rows are drawn on every run

**6. Check Null values in the Dataset**

In [None]:
data.isnull().sum(axis=0)   # axis=0 (the default): number of nulls in each column

In [None]:
data.isnull().sum(axis=1)   # axis=1: number of nulls in each row

In [None]:
# Visualizing Null values
# Each heatmap cell is one (row, column) entry of the boolean null mask.
# NOTE: in notebook order this runs before '?' is replaced with NaN, so '?'
# placeholders are not counted as null here.

sns.heatmap(data.isnull())

**7. Perform Data Cleaning [Replace '?' with NaN]**

In [None]:
# Count, per column, how many entries are exactly the string '?'.
data.isin(['?']).sum(axis=0)

In [145]:
# Replace every entry equal to '?' with NaN across the whole frame, modifying `data` in place.
data.replace('?',np.nan,inplace=True)

In [55]:
# or: restrict the replacement to the listed columns only
columns_with_marks = ['workclass','occupation','native-country']

data[columns_with_marks] = data[columns_with_marks].replace('?', np.nan)

**8. Drop all the missing values**

In [None]:
# Null count per column. In notebook order this follows the '?' -> NaN replacement,
# so the former '?' placeholders are now counted as missing.
data.isnull().sum(axis=0)

In [None]:
# Percentage of missing data per column
null_counts = data.isna().sum()
null_counts * 100 / len(data)

In [None]:
# Visualizing Null values
# Heatmap of the boolean null mask, plotted after the '?' -> NaN replacement step.

sns.heatmap(data.isnull())

In [None]:
# Dropping
# dropna() with default arguments removes every row that has at least one null;
# inplace=True mutates `data` rather than returning a new frame.

data.dropna(inplace=True)

# Confirm the result: per-column null counts after the drop.
data.isnull().sum(axis=0)

**9. Check For Duplicate Data and Drop Them**

In [None]:
# True if at least one row is an exact duplicate of an earlier row.
data.duplicated().any()

In [147]:
# Remove duplicate rows in place, keeping the first occurrence of each.
data.drop_duplicates(keep='first',inplace=True)

**10. Get Overall Statistics about the DataFrame**

In [None]:
# Summary statistics; with default arguments describe() covers the numeric columns only.
data.describe()

**11.  Drop the Columns educational-num, capital-gain and capital-loss**

Because educational-num carries the same information as the education column, and
capital-gain and capital-loss are 0.0 for at least 75% of the rows, we drop these three columns.

In [None]:
# include='all' adds the non-numeric columns to the summary (count, unique, top, freq).
data.describe(include='all')

In [103]:
# Drop the three columns in place. errors='ignore' keeps the cell re-runnable:
# without it a second execution raises KeyError because the columns are already gone.
data.drop(columns=['educational-num','capital-gain','capital-loss'],inplace=True,errors='ignore')

# Univariate Analysis
- Univariate Analysis is a type of data visualization where we visualize only a single variable at a time. Univariate Analysis helps us to analyze the distribution of the variable present in the data so that we can perform further analysis

**12. What is the Distribution of age Column ?**

In [None]:
# Summary statistics for the age column.
data['age'].describe()

In [None]:
# Histogram of age using pandas' built-in plotting with its default binning.
data['age'].hist()

In [None]:
# Histogram of age drawn with seaborn, using 25 bins.
sns.histplot(data['age'] , bins=25)
plt.show()

**13. Find Total Number of persons Having age Between 17 to 48(inclusive) Using Between Method**

In [None]:
# between() is inclusive on both ends by default, so this counts rows with 17 <= age <= 48.
data['age'].between(17,48).sum()

In [None]:
# Same count with an explicit boolean mask. Each comparison must be parenthesized
# because & binds more tightly than >= and <=.
age_in_range = (data['age'] >= 17) & (data['age'] <= 48)
len(data[age_in_range])

**14. What is the Distribution of workclass Column?**

In [None]:
# Summary of the workclass column itself (count, unique, top, freq).
# data.describe() with default arguments only reports numeric columns,
# so it says nothing about workclass.
data['workclass'].describe()

In [None]:
# Number of rows per workclass value, most frequent first.
data['workclass'].value_counts()

In [None]:
# Countplot
# One bar per workclass value; the x tick labels are rotated 90 degrees.
sns.countplot(data=data , x='workclass')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Histogram
# histplot applied to the workclass column, with the x tick labels rotated 90 degrees.
sns.histplot(data['workclass'])
plt.xticks(rotation=90)
plt.show()

**15. How Many Persons Having Bachelors or Masters Degree?**

In [None]:
# Number of rows per education level, most frequent first.
data['education'].value_counts()

In [None]:
# Count rows whose education is Bachelors or Masters using an explicit boolean mask.
has_degree = (data['education'] == 'Bachelors') | (data['education'] == 'Masters')
len(data[has_degree])

In [None]:
# Same count via isin(): True where education is one of the listed values.
data['education'].isin(['Bachelors','Masters']).sum()

# 16. Bivariate Analysis:
- It means visualizing the relationship between two variables (e.g. scatter plot, line plot, box plot).

In [None]:
# Income vs Age
# Box plot of the age distribution for each income group.

sns.boxplot(data=data , x='income' , y='age')
plt.show()

In [None]:
# Scatter plot of fnlwgt against age, one point per row.
sns.scatterplot(x = 'age' , y = 'fnlwgt' , data=data)

**17. Replace Salary Values ['<=50K', '>50K'] with 0 and 1**

In [None]:
# Distinct labels present in the income column.
data['income'].unique()

In [None]:
# Number of rows per income label.
data['income'].value_counts()

In [None]:
# visualizing
# Bar chart of the row count for each income label.
sns.countplot(data=data , x='income')
plt.show()

In [133]:
# Replacing using Apply

def income(x):
  """Encode an income label: 0 for '<=50K', 1 for any other value."""
  return 0 if x == '<=50K' else 1

# Adds a 0/1 column next to the original labels. Must run while `income` still holds
# the string labels: income() returns 1 for anything other than '<=50K', so running
# this after the column has been replaced with 0/1 encodes every row as 1.
data['encoded_income'] = data['income'].apply(income)

In [None]:
# Using Replace
# Map the two string labels to 0/1 and assign the result back to the column.
# Calling replace(..., inplace=True) on data['income'] is chained assignment:
# pandas 2.x warns about it, and under Copy-on-Write `data` is left unchanged.
# dict.get(v, v) passes already-encoded values through, so re-running the cell is
# harmless; astype(int) fails loudly if an unexpected label is present.
income_codes = {'<=50K': 0, '>50K': 1}
data['income'] = data['income'].map(lambda v: income_codes.get(v, v)).astype(int)

# or replace at the frame level; the result must be assigned back because
# replace() returns a new frame instead of modifying `data`:
# data = data.replace({'income': income_codes})

**18. Which workclass Is Getting the Highest Salary?**

In [None]:
# Mean of income per workclass, highest first. With income encoded as 0/1 this is the
# share of rows in each workclass earning >50K, so income must already be numeric here.
data.groupby('workclass')['income'].mean().sort_values(ascending=False)

**19. Who Has a Better Chance to Get Salary >50K: Male or Female?**

In [None]:
# Mean of income per gender, highest first. With income encoded as 0/1 this is the
# share of rows in each gender earning >50K, so income must already be numeric here.
data.groupby('gender')['income'].mean().sort_values(ascending=False)

**20. Convert workclass Columns Datatype to Category Datatype**

In [None]:
# dtype of every column.
data.dtypes

In [159]:
# Convert workclass to the pandas categorical dtype and store it back on the frame.
data['workclass'] = data['workclass'].astype('category')

In [None]:
# Series summary, including its dtype and memory usage.
data['workclass'].info()

# Changing the dtype to category can reduce memory usage.