## Feature Engineering-4

In [1]:
# Q1. Pearson correlation coefficient is a measure of the linear relationship between two variables. Suppose
# you have collected data on the amount of time students spend studying for an exam and their final exam
# scores. Calculate the Pearson correlation coefficient between these two variables and interpret the result.

# Ans:

# Import the package
import pandas as pd

# Build the study-time / exam-score table from two parallel lists
study_hours = [2, 4, 6, 2, 4, 8, 10, 4]
exam_scores = [60, 70, 80, 68, 86, 92, 98, 84]
data = pd.DataFrame({"time_spend": study_hours, "score": exam_scores})

# Pairwise correlation matrix; Pearson is the default method of DataFrame.corr
p_corr = data.corr()
print(p_corr)

            time_spend     score
time_spend    1.000000  0.869713
score         0.869713  1.000000


In [2]:
# Interpretation:
# The Pearson coefficient (r ≈ 0.87) indicates a strong positive linear correlation between time_spend and score
# in this dataset: students who spent more time studying tended to achieve higher scores.
# Note that correlation alone does not prove that more study time causes the higher scores.

In [3]:
# Q2. Spearman's rank correlation is a measure of the monotonic relationship between two variables.
# Suppose you have collected data on the amount of sleep individuals get each night and their overall job
# satisfaction level on a scale of 1 to 10. Calculate the Spearman's rank correlation between these two
# variables and interpret the result.

# Import packages
import pandas as pd
import numpy as np
from scipy.stats import spearmanr

# Synthetic sample of 10 individuals; the seed keeps reruns identical
np.random.seed(42)
sleep_hours = np.random.uniform(4, 9, 10)  # continuous values in the range 4 to 9
job_satisfaction = np.random.randint(1, 11, 10)  # integer ratings 1..10 (upper bound is exclusive)
data = {
    "Sleep Hours": sleep_hours,
    "Job Satisfaction": job_satisfaction,
}

# Tabular view of the two variables
df = pd.DataFrame(data)

# Spearman's rank correlation; spearmanr yields the coefficient and its p-value
sleep_col, satisfaction_col = df.columns
spearman_corr, p_value = spearmanr(df[sleep_col], df[satisfaction_col])

# Show the table together with the coefficient as the cell result
df, spearman_corr


(   Sleep Hours  Job Satisfaction
 0     5.872701                 6
 1     8.753572                 5
 2     7.659970                 2
 3     6.993292                 8
 4     4.780093                 6
 5     4.779973                 2
 6     4.290418                 5
 7     8.330881                 1
 8     7.005575                10
 9     7.540363                 6,
 -0.19756474308710295)

In [4]:
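# Interpretation:
# The correlation coefficient (ρ) is -0.198, which indicates a weak negative correlation between Sleep Hours and
# Job Satisfaction.
# This means that as sleep hours increase, job satisfaction tends to decrease slightly, but the relationship is weak.
# Caveat: the two columns are generated independently at random and the sample has only 10 rows, so a coefficient
# this small is best read as noise; check the p_value returned by spearmanr before drawing any conclusion.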

In [5]:
# Q3. Suppose you are conducting a study to examine the relationship between the number of hours of
# exercise per week and body mass index (BMI) in a sample of adults. You collected data on both variables
# for 50 participants. Calculate the Pearson correlation coefficient and the Spearman's rank correlation
# between these two variables and compare the results.


# Ans:

# Import packages
import pandas as pd
import numpy as np
from scipy.stats import pearsonr, spearmanr


# Synthetic sample of 50 participants; the seed keeps reruns identical
np.random.seed(42)
exercise_hours = np.random.uniform(1, 14, 50)  # weekly exercise hours in the range 1 to 14
# NOTE: randint's upper bound is exclusive, so BMI values are integers 15..39 (40 is never drawn)
bmi_values = np.random.randint(15, 40, 50)
data = {
    "Exercise Per Week": exercise_hours,
    "BMI": bmi_values,
}

# Tabular view of the two variables
df = pd.DataFrame(data)

exercise = df['Exercise Per Week']
bmi = df['BMI']

# Rank-based (monotonic) association
spearman_corr, spearman_p_value = spearmanr(exercise, bmi)

# Linear association
pearson_corr, pearson_p_value = pearsonr(exercise, bmi)

print("spearman_corr:", spearman_corr, "\n\npearson_corr:", pearson_corr)


spearman_corr: -0.10330660689994277 

pearson_corr: -0.077426401817155


In [6]:
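# Both correlation coefficients are negative and very close to 0 (Spearman ≈ -0.10, Pearson ≈ -0.08), so there is
# at most a very weak negative association between weekly exercise hours and BMI in this synthetic sample.
# The two measures agree closely, which suggests there is no strong non-linear monotonic pattern or outlier effect
# that would make the rank-based and linear coefficients diverge.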

In [7]:
# Q4. A researcher is interested in examining the relationship between the number of hours individuals
# spend watching television per day and their level of physical activity. The researcher collected data on
# both variables from a sample of 50 participants. Calculate the Pearson correlation coefficient between
# these two variables.

# Ans:


# Import packages
import pandas as pd
import numpy as np
from scipy.stats import pearsonr

# Synthetic sample of 50 participants; the seed keeps reruns identical
np.random.seed(42)
tv_hours = np.random.uniform(0, 10, 50)  # daily TV hours in the range 0 to 10
activity_level = np.random.randint(1, 11, 50)  # 1 = least active ... 10 = most active
data = {
    'watching television per day': tv_hours,
    'physical activity': activity_level,
}

df = pd.DataFrame(data)

# pearsonr yields the coefficient together with its p-value
tv_col, activity_col = df.columns
p_corr, p_value = pearsonr(df[tv_col], df[activity_col])

print(p_corr)

0.06563469824484601


In [8]:
# Q5. A survey was conducted to examine the relationship between age and preference for a particular
# brand of soft drink. The survey results are shown below:
# Age(Years): [25, 42, 37, 19, 31, 28], 
# Soft drink Preference: ['coke', 'pepsi', 'Mountain dew', 'coke', 'pepsi', 'coke']

# Ans:

# Import packages
import pandas as pd
import numpy as np
from scipy.stats import spearmanr

# Given data
age = [25, 42, 37, 19, 31, 28]
soft_drink = ['coke', 'pepsi', 'Mountain dew', 'coke', 'pepsi', 'coke']


def correlation_ratio(categories, values):
    """Return the correlation ratio (eta) between a nominal and a numeric variable.

    eta = sqrt(SS_between / SS_total), where SS_between is the size-weighted squared
    deviation of each category mean from the grand mean and SS_total is the total
    squared deviation of the values. The result lies in [0, 1] and depends only on
    how the observations are grouped, not on the labels or codes of the categories.

    Parameters
    ----------
    categories : sequence
        Category label of each observation.
    values : sequence of numbers
        Numeric measurement of each observation; same length as ``categories``.

    Returns
    -------
    float
        The correlation ratio, or NaN when the input is empty or the values have
        no variance (the ratio is undefined in those cases).

    Raises
    ------
    ValueError
        If the two inputs have different lengths.
    """
    labels = np.asarray(categories)
    numbers = np.asarray(values, dtype=float)
    if labels.shape[0] != numbers.shape[0]:
        raise ValueError("categories and values must have the same length")
    if numbers.size == 0:
        return float("nan")

    grand_mean = numbers.mean()
    ss_total = ((numbers - grand_mean) ** 2).sum()
    if ss_total == 0:
        return float("nan")

    ss_between = 0.0
    for label in np.unique(labels):
        group = numbers[labels == label]
        ss_between += group.size * (group.mean() - grand_mean) ** 2

    # Guard against rounding pushing the ratio marginally above 1
    return float(np.sqrt(min(ss_between / ss_total, 1.0)))


# Encoding soft drink preferences
# NOTE: brand is a nominal variable, so these integer codes are arbitrary. The Spearman
# coefficient below changes if the codes are permuted; it is kept only for reference.
drink_mapping = {'coke': 1, 'pepsi': 2, 'Mountain dew': 3}
soft_drink_numeric = [drink_mapping[drink] for drink in soft_drink]


df_survey = pd.DataFrame({'Age': age, 'Soft Drink (Numeric)': soft_drink_numeric})

# Calculating Spearman's Rank Correlation (encoding-dependent, see note above)
spearman_corr_survey, p_value_survey = spearmanr(df_survey['Age'], df_survey['Soft Drink (Numeric)'])

# Encoding-independent strength of association between age and brand preference
eta_survey = correlation_ratio(soft_drink, age)
print("Correlation ratio (eta):", eta_survey)

# Display DataFrame and correlation result
df_survey, spearman_corr_survey


(   Age  Soft Drink (Numeric)
 0   25                     1
 1   42                     2
 2   37                     3
 3   19                     1
 4   31                     2
 5   28                     1,
 0.8332380897952965)

In [9]:
# Q6. A company is interested in examining the relationship between the number of sales calls made per day
# and the number of sales made per week. The company collected data on both variables from a sample of
# 30 sales representatives. Calculate the Pearson correlation coefficient between these two variables.

# Ans:

import pandas as pd
import numpy as np

# Synthetic sample for 30 sales representatives; the seed keeps reruns identical
np.random.seed(42)

# Daily call counts: integers 10..50 (randint's upper bound of 51 is exclusive)
sales_calls_per_day = np.random.randint(10, 51, 30)

# Weekly sales are 0.6 * calls plus Gaussian noise (mean 0, std 5),
# so the two variables are correlated by construction
noise = np.random.normal(0, 5, 30)
sales_per_week = sales_calls_per_day * 0.6 + noise

# Assemble both series into one table
df_sales = pd.DataFrame(
    {
        'Sales Calls per Day': sales_calls_per_day,
        'Sales per Week': sales_per_week,
    }
)

# Take the off-diagonal entry of the (default Pearson) correlation matrix
corr_matrix = df_sales.corr()
pearson_corr = corr_matrix.loc['Sales Calls per Day', 'Sales per Week']

# Display result
print("Pearson Correlation Coefficient:", pearson_corr)


Pearson Correlation Coefficient: 0.8488529231577153
