In [1]:
pip install pandas openpyxl pyspark




What are the first 5 records in the internship dataset?

In [17]:
from pyspark import SparkConf, SparkContext
import pandas as pd

# Attach to the SparkContext that is already running, or start one if needed.
sc = SparkContext.getOrCreate()
sc.setLogLevel("ERROR")

# pandas parses the workbook; every row is then turned into a plain Python list.
df = pd.read_excel("internships.xlsx")
data_list = df.values.tolist()

# Distribute the rows as an RDD.
rdd = sc.parallelize(data_list)

# Show a small sample of the data.
print("First 5 records:")
first_records = rdd.take(5)
for row in first_records:
    print(row)

# The context is deliberately left running (no sc.stop()) so that other cells
# can keep using it.

First 5 records:
['Java Development', 'SunbaseData', 'Work From Home', 'Java, OOP, Spring, Problem Solving', 'Above 90%', 'Above 70%', 'Above 9.0']
['Accounting and Finance', 'DAKSM & Co. LLP', 'Noida', 'Accounting, Finance, MS Excel, Tally', 'Above 75%', 'Above 80%', 'Above 9.0']
['Sales & Digital Marketing', 'Bharat Natural Elements Private Limited', 'Bangalore', 'Sales, Digital Marketing, SEO, Communication', 'Above 80%', 'Above 70%', 'Above 8.5']
['Social Entrepreneurship', 'Hamari Pahchan NGO', 'Work From Home', 'NGO Management, Fundraising, Leadership', 'Above 90%', 'Above 90%', 'Above 9.0']
['Videography & Photography', 'Esquare Lifestyle', 'Bangalore', 'Videography, Photography, Editing, Creativity', 'Above 70%', 'Above 80%', 'Above 7.5']


Which internship title has the highest number of openings?

In [18]:
from pyspark import SparkConf, SparkContext
import pandas as pd

# Attach to the SparkContext that is already running, or start one if needed.
sc = SparkContext.getOrCreate()
sc.setLogLevel("ERROR")

# Load the workbook and distribute its rows (as plain lists) as an RDD.
df = pd.read_excel("internships.xlsx")
data_rdd = sc.parallelize(df.values.tolist())

# Column 0 is the internship title: emit (title, 1) per row and sum per title.
title_pairs = data_rdd.map(lambda row: (row[0], 1))
title_count = title_pairs.reduceByKey(lambda left, right: left + right)

# Sorting on the negated count puts the most frequent title first.
max_title = title_count.sortBy(lambda pair: -pair[1]).first()
top_title, top_openings = max_title

print(f"Internship title with highest openings: {top_title} ({top_openings} openings)")

# The context is deliberately left running (no sc.stop()) so that other cells
# can keep using it.

Internship title with highest openings: Human Resources (HR) (5 openings)


Which company has the lowest number of internship openings?

In [4]:
from pyspark import SparkConf, SparkContext
import pandas as pd

# Reuse the SparkContext that other cells in this notebook create and leave
# running. Constructing a second one with SparkContext(conf=conf) raises
# ValueError while a context is active. The conf below is only applied when
# no context exists yet.
conf = SparkConf().setAppName("Internship_Q3").setMaster("local[*]")
sc = SparkContext.getOrCreate(conf)
sc.setLogLevel("ERROR")

# Load the workbook and distribute its rows (as plain lists) as an RDD.
df = pd.read_excel("internships.xlsx")
data_rdd = sc.parallelize(df.values.tolist())

# Company name is column 1: emit (company, 1) per row and sum per company.
company_count = data_rdd.map(lambda x: (x[1], 1)).reduceByKey(lambda a, b: a + b)

# Ascending sort on the count puts a company with the fewest rows first.
min_company = company_count.sortBy(lambda x: x[1]).first()

print(f"Company with lowest internship openings: {min_company[0]} ({min_company[1]} openings)")

# Note: sc.stop() is removed to allow reusing the SparkContext in other cells


Company with lowest internship openings: SunbaseData (1 openings)


What is the average CGPA by location?

In [13]:
from pyspark import SparkConf, SparkContext
import pandas as pd
import re

# Attach to the SparkContext that is already running, or start one if needed.
sc = SparkContext.getOrCreate()
sc.setLogLevel("ERROR")

# Read the workbook with pandas.
df = pd.read_excel("internships.xlsx")

# Distribute the rows (as plain Python lists) as an RDD.
rows = df.values.tolist()
data_rdd = sc.parallelize(rows)

# Helper function to extract numeric CGPA from string
def extract_cgpa(value):
    """Return the first number found in ``value`` as a float, or None.

    ``value`` is converted with ``str()`` before searching, so non-string
    cells (numbers, NaN, None) are accepted. For example "Above 9.0" gives
    9.0. None is returned when the text contains no digits.
    """
    try:
        # First integer or decimal number in the text, e.g. "9.0" in "Above 9.0".
        match = re.search(r"\d+(\.\d+)?", str(value))
        return float(match.group()) if match else None
    except (TypeError, ValueError):
        # Best effort for values that cannot be converted. Unlike a bare
        # except, this lets KeyboardInterrupt/SystemExit propagate.
        return None

# Pair each row's location (column 2) with its parsed CGPA (column 6) and
# drop the rows where no number could be extracted.
location_cgpa = (
    data_rdd
    .map(lambda row: (row[2], extract_cgpa(row[6])))
    .filter(lambda pair: pair[1] is not None)
)


def _start_acc(cgpa):
    """Begin a (running sum, count) accumulator from the first CGPA of a location."""
    return (cgpa, 1)


def _add_value(acc, cgpa):
    """Fold one more CGPA into a (running sum, count) accumulator."""
    return (acc[0] + cgpa, acc[1] + 1)


def _merge_accs(left, right):
    """Combine two (running sum, count) accumulators."""
    return (left[0] + right[0], left[1] + right[1])


# Average per location = sum / count, rounded to two decimals.
avg_cgpa = (
    location_cgpa
    .combineByKey(_start_acc, _add_value, _merge_accs)
    .mapValues(lambda acc: round(acc[0] / acc[1], 2))
)

# Report one line per location.
print("Average CGPA by location:")
for location, mean_cgpa in avg_cgpa.collect():
    print(f"{location}: {mean_cgpa}")

# The context is deliberately left running (no sc.stop()) so that other cells
# can keep using it.

Average CGPA by location:
Noida: 7.5
Delhi: 7.4
Mumbai: 7.0
Hyderabad: 7.5
Sachin INA: 6.5
Work From Home: 7.5
Bangalore: 7.79
Gurgaon: 7.5
Chennai: 6.0
Chandigarh: 8.0


What is the probability distribution of CGPA ranges?

In [14]:
from pyspark import SparkConf, SparkContext
import pandas as pd
import re

# Attach to the SparkContext that is already running, or start one if needed.
sc = SparkContext.getOrCreate()
sc.setLogLevel("ERROR")

# Read the workbook with pandas and distribute its rows (as plain lists) as an RDD.
df = pd.read_excel("internships.xlsx")
rows = df.values.tolist()
data_rdd = sc.parallelize(rows)

# Helper function to extract numeric CGPA from string
def extract_cgpa(value):
    """Return the first number found in ``value`` as a float, or None.

    ``value`` is converted with ``str()`` before searching, so non-string
    cells (numbers, NaN, None) are accepted. For example "Above 9.0" gives
    9.0. None is returned when the text contains no digits.
    """
    try:
        # First integer or decimal number in the text, e.g. "9.0" in "Above 9.0".
        match = re.search(r"\d+(\.\d+)?", str(value))
        return float(match.group()) if match else None
    except (TypeError, ValueError):
        # Best effort for values that cannot be converted. Unlike a bare
        # except, this lets KeyboardInterrupt/SystemExit propagate.
        return None

# Column 6 holds the CGPA requirement text; keep only the rows where a number
# could be extracted from it.
cgpa_rdd = (
    data_rdd
    .map(lambda row: extract_cgpa(row[6]))
    .filter(lambda cgpa: cgpa is not None)
)

# Size of the whole sample and of each CGPA range.
total = cgpa_rdd.count()
above_8 = cgpa_rdd.filter(lambda cgpa: cgpa > 8.0).count()
between_6_8 = cgpa_rdd.filter(lambda cgpa: 6.0 <= cgpa <= 8.0).count()
below_6 = cgpa_rdd.filter(lambda cgpa: cgpa < 6.0).count()


def _share(bucket_size):
    """Fraction of all parsed CGPAs that fall in a range, rounded to 3 places.

    Returns 0 when no CGPA values were parsed at all.
    """
    if total > 0:
        return round(bucket_size / total, 3)
    return 0


p_above_8 = _share(above_8)
p_between_6_8 = _share(between_6_8)
p_below_6 = _share(below_6)

print("Probability distribution:")
print(f"P(CGPA > 8.0)     = {p_above_8}")
print(f"P(6.0 ≤ CGPA ≤ 8.0) = {p_between_6_8}")
print(f"P(CGPA < 6.0)     = {p_below_6}")

# The context is deliberately left running (no sc.stop()) so that other cells
# can keep using it.


Probability distribution:
P(CGPA > 8.0)     = 0.265
P(6.0 ≤ CGPA ≤ 8.0) = 0.735
P(CGPA < 6.0)     = 0.0
