### Import Necessary Libraries

In [197]:
import pandas as pd
import matplotlib.pyplot as plt

### Reading Data from file

In [198]:
data = pd.read_csv('C:/Users/Vaibhav/Desktop/inotask1/raw_data_v11_3.csv')

In [199]:
# Current professions of Interns
current_profession = data['What is your current profession?'].value_counts()
print("Current profession of most interns:", current_profession)

Current profession of most interns: Student (I am currently enrolled in some Educational Program)                          488
Fresher (I have completed my Education and looking for a job now)                      424
Working Professional (I have completed my Education and working full time)              62
Freelancer (I have completed my Education and working part time or owns a business)     26
Name: What is your current profession?, dtype: int64


In [200]:
# determining the availabilty time of most intern for live session
live_session_timings = data['Please indicate the time that suits you for your daily LIVE interaction with our Chief Data Scientist as part of your internship.'].value_counts()
print("Preferred LIVE session timings:", live_session_timings)

Preferred LIVE session timings: 6:00 to 7:00 PM IST    672
5:00 to 6:00 PM IST    328
Name: Please indicate the time that suits you for your daily LIVE interaction with our Chief Data Scientist as part of your internship., dtype: int64


In [201]:
# level of experience or awareness of interns with data science
data_science_courses = data['Which of the following best describes you?'].value_counts()
print("Current course acquisition of interns:", data_science_courses)

Current course acquisition of interns: I am currently enrolled in a Data Science certification course    405
I have never enrolled in any certification courses                302
I have completed a Data Science certification course              293
Name: Which of the following best describes you?, dtype: int64


In [202]:
# Profession of intern with no enrollment in any certification courses
never_enrolled_profession = data[data['Which of the following best describes you?'] == 'I have never enrolled in any certification courses']['What is your current profession?'].value_counts().idxmax()
print("Most common profession of interns who have never enrolled in any certification courses:", never_enrolled_profession)

Most common profession of interns who have never enrolled in any certification courses: Student (I am currently enrolled in some Educational Program)


In [203]:
age_groups = data['Date of Birth'].apply(lambda x: x[:4])
age_groups = age_groups.value_counts()
print("Age groups of fellow interns:", age_groups)

Age groups of fellow interns: 10/2    33
11/2    32
12/2    30
10/1    28
11/1    28
        ..
3/17     1
1/25     1
3/1/     1
3/9/     1
6/3/     1
Name: Date of Birth, Length: 274, dtype: int64


In [204]:
# Standardize date formats in 'Date of Birth' column
data['Date of Birth'] = pd.to_datetime(data['Date of Birth'], errors='coerce')

age_series = (pd.to_datetime('today') - data['Date of Birth']).astype('<m8[Y]')

min_age = age_series.min()

print("Minimum age of an intern:", min_age)

Minimum age of an intern: -1.0


In [205]:

data['Date of Birth'] = pd.to_datetime(data['Date of Birth'], errors='coerce')

# Count the number of interns born on a Sunday
sunday_born = data['Date of Birth'].dt.dayofweek == 6
num_sunday_born = sunday_born.sum()
print("Number of interns born on a Sunday:", num_sunday_born)



Number of interns born on a Sunday: 125


In [206]:
# Find the month in which the majority of interns are born
birth_months = data['Date of Birth'].apply(lambda x: x.month)
majority_birth_month = birth_months.value_counts().idxmax()
print("Month in which the majority of interns are born:", majority_birth_month)

Month in which the majority of interns are born: 7


In [207]:
# Calculate the average age of working professionals
working_professionals = data[data['What is your current profession?'] == 'Working Professional (I have completed my Education and working full time)'].copy()

current_year = pd.to_datetime('today').year
working_professionals['Date of Birth'] = pd.to_datetime(working_professionals['Date of Birth'], errors='coerce')
working_professionals['Age'] = current_year - working_professionals['Date of Birth'].dt.year
average_age = working_professionals['Age'].mean()
print("Average age of a working professional:", average_age)


Average age of a working professional: 27.048387096774192


In [212]:
#Calculating max. age of freelancer
freelancers = data.loc[data['What is your current profession?'] == 'Freelancer (I have completed my Education and working part time or owns a business)', :].copy()
freelancers.loc[:, 'Date of Birth'] = pd.to_datetime(freelancers['Date of Birth'], format='%m-%d-%Y', errors='coerce')
freelancers.dropna(subset=['Date of Birth'], inplace=True)
max_age = (pd.to_datetime('today').year - freelancers['Date of Birth'].dt.year).max()
print("Maximum age of a freelancer:", max_age)


Maximum age of a freelancer: 44


  freelancers.loc[:, 'Date of Birth'] = pd.to_datetime(freelancers['Date of Birth'], format='%m-%d-%Y', errors='coerce')


In [213]:
# Split the skills and count the occurrences of each skill for data scientist
skills = data['What do you think are most important skill right now for a skilled Data Scientist?'].str.split(', ', expand=True).stack().value_counts()

skills_descending = skills.sort_values(ascending=False)
print("Skills in descending order:")
for skill, count in skills_descending.items():
    print(f"{skill}: {count}")


Skills in descending order:
Extracting actionable insights from the raw data: 869
Data Modelling: 825
MLOps: 596
LLMs: 533
Basics of backend application development using python: 522
Prompt Engineering: 425


In [209]:
# Convert the skills data to string format
data['What do you think are most important skill right now for a skilled Data Scientist?'] = data['What do you think are most important skill right now for a skilled Data Scientist?'].astype(str)

# Count the number of interns who selected "Prompt Engineering" as the most important skill
num_prompt_engineering = data['What do you think are most important skill right now for a skilled Data Scientist?'].str.contains('Prompt Engineering').sum()
print("Number of interns who selected 'Prompt Engineering' as the most important skill:", num_prompt_engineering)

Number of interns who selected 'Prompt Engineering' as the most important skill: 425


In [210]:
# Count the number of interns who did not mention the name of the data science institute
no_institute_mentioned = data['Please mention the name of the institute where you have completed or are currently pursuing your Data Science Certification Course.'].isna().sum()
print("Number of interns who did not mention the name of the data science institute:", no_institute_mentioned)

Number of interns who did not mention the name of the data science institute: 322


In [211]:
# Determining number of interns who mentioned "innomatics" in the institute names

# Convert the institute names to lowercase for case-insensitive matching
institute_names = data['Please mention the name of the institute where you have completed or are currently pursuing your Data Science Certification Course.'].str.lower()

num_innomatics_mentioned = institute_names.str.contains('innomatics').sum()
print("Number of interns who mentioned Innomatics as their data science institute:", num_innomatics_mentioned)


Number of interns who mentioned Innomatics as their data science institute: 278
