In [1]:
import numpy as np

import pandas as pd

import random


In [3]:
np.random.seed(42) # For reproducibility
n_samples = 100 # Number of samples
data = {
    'age': np.random.randint(18, 60, size=n_samples),
    'salary': np.random.randint(30000, 120000, size=n_samples),
    'department': np.random.choice(['IT', 'HR', 'Finance', 'Marketing'], size=n_samples),
    'years_experience': np.round(np.random.normal(5, 2, size=n_samples), 1),
    'is_manager': np.random.choice([0, 1], size=n_samples)
}
df = pd.DataFrame(data)

 Q1. View data structure


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               100 non-null    int64  
 1   salary            100 non-null    int64  
 2   department        100 non-null    object 
 3   years_experience  100 non-null    float64
 4   is_manager        100 non-null    int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 4.0+ KB


Q2. Get DataFrame Info and Summary Stats

In [5]:
df.describe(include ='all')

Unnamed: 0,age,salary,department,years_experience,is_manager
count,100.0,100.0,100,100.0,100.0
unique,,,4,,
top,,,Marketing,,
freq,,,36,,
mean,37.91,77809.16,,4.823,0.47
std,12.219454,26058.643576,,2.237822,0.501614
min,18.0,30206.0,,-0.8,0.0
25%,26.75,55141.0,,3.475,0.0
50%,38.0,80932.0,,4.7,0.0
75%,46.25,98107.25,,6.0,1.0


Q3. Do Simple Numpy Operations

In [6]:
#Mean , Medain, Standard Deviation
# Mean salary
mean_salary = np.mean(df['salary'])
print("Number of managers:", mean_salary)

# Median age
median_age = np.median(df['age'])
print(f"Median employee age: {median_age} years")

Number of managers: 77809.16
Median employee age: 38.0 years


In [7]:
# Standard deviation of years of experience
std_experience = np.std(df['years_experience'])
print(f" Employee Experience: {std_experience} years Experience ")

#Aggregation and Conditions
# Count of managers
num_managers = np.sum(df['is_manager'])
print("Total number of managers are there in company:",num_managers)


 Employee Experience: 2.2266052636244265 years Experience 
Total number of managers are there in company: 47


In [10]:
# Employees with more than 10 years experience
experienced = df['years_experience'] > 10
num_experienced = np.sum(experienced)
print("\nNumber of employees with more thatn 10 years of experience in company:",num_experienced)




Number of employees with more thatn 10 years of experience in company: 1


In [9]:
#Random and array operation
salary_array = df['salary'].to_numpy()
print("\n Salary data about all employees:")
print(salary_array)
normalized_salary = (salary_array - np.min(salary_array)) / (np.max(salary_array) - np.min(salary_array))


 Salary data about all employees:
[ 38392  60535 108603  82256 119135  65222 107373 109575 114651  93335
  40965  54538 100592  38110 109309  57266  82992 112948  36910  30206
 117054 117897  53419  80636  80015  84268 117939  48141 110356 101910
  86044  97214  63827  85820  92623 111734 105450  52299  73585  94044
  72557  79080  32693  99163  55939  78925  72941  51834  48047  56105
 105766  45707  51976  74262  53776  60080  96842  91373  36776  85016
  39474  88053  51959  35530  33748  43545  96199  64766 103530  91087
  98840  84384  81005  76576  69353  92003 113211  82733  95318 119474
  53664  97172 115616  56736  30854  68623  37392  85680  76717 117092
  80859  56309 117455  93734 100467  82662  42688  55342  67157  97863]


In [11]:
#Random operation
print("\n Normalization Salary of employees")
print(normalized_salary)



 Normalization Salary of employees
[0.09170139 0.33975221 0.87822064 0.58307568 0.99620245 0.39225702
 0.86444191 0.8891092  0.94597168 0.70718511 0.12052471 0.27257248
 0.78847963 0.08854237 0.88612941 0.30313214 0.59132052 0.9268943
 0.0750997  0.         0.97289062 0.9823341  0.26003719 0.56492808
 0.5579715  0.60561455 0.98280459 0.20091186 0.89785814 0.80324416
 0.6255097  0.75063853 0.37662992 0.6230004  0.69920912 0.9132948
 0.84290003 0.2474907  0.48594121 0.71512748 0.47442533 0.54749742
 0.02785993 0.77247166 0.28826679 0.54576108 0.47872698 0.24228167
 0.19985885 0.29012636 0.84643993 0.17364565 0.24387238 0.49352512
 0.26403638 0.3346552  0.7464713  0.68520634 0.0735986  0.61399382
 0.1038222  0.64801497 0.24368195 0.05964063 0.03967827 0.14942645
 0.73926827 0.38714881 0.82139176 0.68200251 0.76885334 0.60691401
 0.5690617  0.51944706 0.43853341 0.69226375 0.92984048 0.58841914
 0.72939911 1.         0.26278174 0.75016803 0.95678183 0.29719496
 0.00725904 0.43035578 0.080

Q4. Filtering and Indexing Rows

In [12]:
# First 5 rows
df.iloc[:5]

df_indexed = df.set_index('department')
print(df_indexed )
df_indexed.loc['Finance']
print(df_indexed.loc)


            age  salary  years_experience  is_manager
department                                           
IT           56   38392              -0.8           0
Marketing    46   60535               3.4           1
HR           32  108603               5.0           1
HR           25   82256               4.2           1
HR           38  119135               4.1           1
...         ...     ...               ...         ...
IT           59   82662               4.0           0
HR           56   42688               4.4           1
Marketing    58   55342               6.0           0
HR           45   67157              11.4           0
HR           24   97863               5.2           1

[100 rows x 4 columns]
<pandas.core.indexing._LocIndexer object at 0x785f8b201680>


Q5. Adding a Column

In [13]:
df['company'] = 'TechCorp'
df.head()

Unnamed: 0,age,salary,department,years_experience,is_manager,company
0,56,38392,IT,-0.8,0,TechCorp
1,46,60535,Marketing,3.4,1,TechCorp
2,32,108603,HR,5.0,1,TechCorp
3,25,82256,HR,4.2,1,TechCorp
4,38,119135,HR,4.1,1,TechCorp


In [14]:
# Calculate salary per year of experience
df['salary_per_year'] = (df['salary'] / df['years_experience']).round(2)
df.head()

Unnamed: 0,age,salary,department,years_experience,is_manager,company,salary_per_year
0,56,38392,IT,-0.8,0,TechCorp,-47990.0
1,46,60535,Marketing,3.4,1,TechCorp,17804.41
2,32,108603,HR,5.0,1,TechCorp,21720.6
3,25,82256,HR,4.2,1,TechCorp,19584.76
4,38,119135,HR,4.1,1,TechCorp,29057.32


In [15]:
# Random performance score between 1 and 10
df['performance_score'] = np.random.randint(1, 11, size=len(df))
df.head()

Unnamed: 0,age,salary,department,years_experience,is_manager,company,salary_per_year,performance_score
0,56,38392,IT,-0.8,0,TechCorp,-47990.0,4
1,46,60535,Marketing,3.4,1,TechCorp,17804.41,9
2,32,108603,HR,5.0,1,TechCorp,21720.6,6
3,25,82256,HR,4.2,1,TechCorp,19584.76,3
4,38,119135,HR,4.1,1,TechCorp,29057.32,1


In [18]:
# Flag high earners
df['high_earner'] = df['salary'] > 100000
df.head()
#df.tail()

Unnamed: 0,age,salary,department,years_experience,is_manager,company,salary_per_year,performance_score,high_earner
0,56,38392,IT,-0.8,0,TechCorp,-47990.0,4,False
1,46,60535,Marketing,3.4,1,TechCorp,17804.41,9,False
2,32,108603,HR,5.0,1,TechCorp,21720.6,6,True
3,25,82256,HR,4.2,1,TechCorp,19584.76,3,False
4,38,119135,HR,4.1,1,TechCorp,29057.32,1,True


Q6. Grouping and Aggregation

In [22]:
df.groupby('department').size()

df.groupby('department')['salary'].mean()

df.groupby('department')['is_manager'].sum()

df.groupby('is_manager')['years_experience'].agg(['mean', 'min', 'max'])

Unnamed: 0_level_0,mean,min,max
is_manager,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,5.25283,-0.8,11.4
1,4.338298,1.0,9.5
