# BUSINESS PROBLEM-1
BACKGROUND: The Lending Club is a peer-to-peer lending site where members make loans to
each other. The site makes anonymized data on loans and borrowers publicly available.

In [24]:
import pandas as pd
import numpy as np
from scipy.stats import f_oneway, pearsonr, ttest_ind, stats

In [14]:
# Load the data
loans_data = pd.read_csv('C:/Users/ADMIN/Documents/hypothesis_data/LoansData.csv')

# Replace periods with underscores in column names
loans_data.rename(columns=lambda x: x.replace('.', '_'), inplace=True)

# Preprocess the data by removing rows with missing or invalid values
loans_data = loans_data.dropna(subset=['Loan_Length', 'Interest_Rate','FICO_Range'])

# Verify the updated column names
print(loans_data.columns)

Index(['Amount_Requested', 'Amount_Funded_By_Investors', 'Interest_Rate',
       'Loan_Length', 'Loan_Purpose', 'Debt_To_Income_Ratio', 'State',
       'Home_Ownership', 'Monthly_Income', 'FICO_Range', 'Open_CREDIT_Lines',
       'Revolving_CREDIT_Balance', 'Inquiries_in_the_Last_6_Months',
       'Employment_Length'],
      dtype='object')


In [9]:
# Hypothesis a: Interest rate varied for different loan amounts
loan_amounts = loans_data['Amount_Requested']
interest_rates = loans_data['Interest_Rate'].str.rstrip('%').astype(float)

f_statistic, p_value = f_oneway(*[interest_rates[loan_amounts == amount] for amount in loan_amounts.unique()])
if p_value < 0.05:
    print("Hypothesis a: Reject the null hypothesis")
else:
    print("Hypothesis a: Fail to reject the null hypothesis")

Hypothesis a: Reject the null hypothesis


In [11]:
# Hypothesis b: Loan length directly affecting interest rate
# Preprocess the data by removing rows with missing or invalid values


loan_lengths = loans_data['Loan_Length'].str.rstrip(' months').astype(float)
interest_rates = loans_data['Interest_Rate'].str.rstrip('%').astype(float)

correlation_coefficient, p_value = pearsonr(loan_lengths, interest_rates)
if p_value < 0.05:
    print("Hypothesis b: Reject the null hypothesis")
else:
    print("Hypothesis b: Fail to reject the null hypothesis")

Hypothesis b: Reject the null hypothesis


In [12]:
# Hypothesis c: Interest rate varies for different purposes of loans
loan_purposes = loans_data['Loan_Purpose']

f_statistic, p_value = f_oneway(*[interest_rates[loan_purposes == purpose] for purpose in loan_purposes.unique()])
if p_value < 0.05:
    print("Hypothesis c: Reject the null hypothesis")
else:
    print("Hypothesis c: Fail to reject the null hypothesis")

Hypothesis c: Reject the null hypothesis


In [15]:
# Hypothesis d: Relationship between FICO scores and home ownership
fico_scores = loans_data['FICO_Range'].str.split('-', expand=True)[0].astype(float)
home_ownership = loans_data['Home_Ownership']


fico_home_owners = fico_scores[home_ownership == 'OWN']
fico_non_home_owners = fico_scores[home_ownership != 'OWN']

t_statistic, p_value = ttest_ind(fico_home_owners, fico_non_home_owners)
if p_value < 0.05:
    print("Hypothesis d: Reject the null hypothesis")
else:
    print("Hypothesis d: Fail to reject the null hypothesis")

Hypothesis d: Fail to reject the null hypothesis


# BUSINESS PROBLEM-2

In [20]:
# Read the CSV file
price_quotes_data = pd.read_csv('C:/Users/ADMIN/Documents/hypothesis_data/Price_Quotes.csv')

# Extract the price quotes for Mary and Barry
mary_quotes = price_quotes_data['Mary_Price']
barry_quotes = price_quotes_data['Barry_Price']

# Perform a two-sample t-test
t_statistic, p_value = ttest_ind(mary_quotes, barry_quotes)

# Interpret the results
alpha = 0.05

if p_value < alpha:
    print("There is a significant difference in the average price quotes between Mary and Barry.")
else:
    print("There is no significant difference in the average price quotes between Mary and Barry.")


There is no significant difference in the average price quotes between Mary and Barry.


# BUSINESS PROBLEM-3

In [22]:
# Load the data
Treatment_facility_data = pd.read_csv('C:/Users/ADMIN/Documents/hypothesis_data/Treatment_Facility.csv')
Treatment_facility_data = Treatment_facility_data.rename(columns={'VAR4': 'TRFF(%)', 'VAR5': 'CI(%)'})

# Separate the data for pre- and post-reengineering periods
pre_reengineering_data = Treatment_facility_data[Treatment_facility_data['Reengineer'] == 'Prior']
post_reengineering_data = Treatment_facility_data[Treatment_facility_data['Reengineer'] == 'Post']

# Calculate the critical incidence rate for pre-reengineering period
pre_critical_incidence_rate = (pre_reengineering_data['CI(%)'].mean())

# Calculate the critical incidence rate for post-reengineering period
post_critical_incidence_rate = (post_reengineering_data['CI(%)'].mean())

# Perform a two-sample t-test
t_statistic, p_value = ttest_ind(pre_reengineering_data['CI(%)'], post_reengineering_data['CI(%)'])

# Interpret the results
alpha = 0.05

if p_value < alpha:
    print("There is evidence that the critical incidence rate improved after the reengineering effort.")
else:
    print("There is no significant evidence that the critical incidence rate improved after the reengineering effort.")

# Print the critical incidence rates for pre- and post-reengineering periods
print("Pre-Reengineering Critical Incidence Rate:", pre_critical_incidence_rate)
print("Post-Reengineering Critical Incidence Rate:", post_critical_incidence_rate)


There is no significant evidence that the critical incidence rate improved after the reengineering effort.
Pre-Reengineering Critical Incidence Rate: 53.887906321846145
Post-Reengineering Critical Incidence Rate: 23.34971927988571


# BUSINESS PROBLEM-4

In [25]:
# Load the data
Priority_data = pd.read_csv('C:/Users/ADMIN/Documents/hypothesis_data/Priority_Assessment.csv')

# Calculate the average completion time for each priority level
avg_completion_time = Priority_data.groupby('Priority')['Days'].mean()

# Perform one-way ANOVA test
result = stats.f_oneway(Priority_data[Priority_data['Priority'] == 'High']['Days'],
                        Priority_data[Priority_data['Priority'] == 'Medium']['Days'])

# Interpret the results
alpha = 0.05

if result.pvalue < alpha:
    print("There is a significant difference in average completion time between different priority levels.")
else:
    print("There is no significant difference in average completion time between different priority levels.")

# Print the average completion time for each priority level
print("Average Completion Time by Priority:")
print(avg_completion_time)

There is no significant difference in average completion time between different priority levels.
Average Completion Time by Priority:
Priority
High      3.023620
Low       4.228358
Medium    2.500000
Name: Days, dtype: float64


  result = stats.f_oneway(Priority_data[Priority_data['Priority'] == 'High']['Days'],


# BUSINESS PROBLEM-5

In [27]:
# Load the data
Films_data = pd.read_csv('C:/Users/ADMIN/Documents/hypothesis_data/Films.csv')

# Customer Satisfaction Analysis
satisfaction_cols = ['Sinage', 'Parking', 'Clean', 'Overall']
satisfaction_scores = Films_data[satisfaction_cols].mean()

overall_satisfaction = satisfaction_scores['Overall']

print("Overall Customer Satisfaction:", overall_satisfaction)

# Factors Linked to Satisfaction
factors_cols = ['Gender', 'Marital_Status', 'Age', 'Income']
factors_satisfaction = Films_data.groupby(factors_cols)[satisfaction_cols].mean()

print("Factors Linked to Satisfaction:")
print(factors_satisfaction)

# Demographic Profile Analysis
demographic_cols = ['Gender', 'Marital_Status', 'Age', 'Income']
demographic_profile = Films_data[demographic_cols].value_counts(normalize=True)

print("Demographic Profile:")
print(demographic_profile)

# Effective Media Outlet Analysis
media_outlet_counts = Films_data['Hear_About'].value_counts(normalize=True)

print("Media Outlets:")
print(media_outlet_counts)

Overall Customer Satisfaction: 1.6189024390243902
Factors Linked to Satisfaction:
                                    Sinage   Parking     Clean   Overall
Gender Marital_Status Age Income                                        
1      1              2.0 1.0     1.750000  1.750000  1.250000  1.750000
                          2.0     2.500000  2.000000  1.500000  1.000000
                          3.0     1.666667  2.000000  1.666667  1.666667
                      3.0 1.0     1.000000  2.000000  2.000000  1.000000
                          2.0     1.833333  2.000000  1.833333  1.500000
                          3.0     1.714286  1.642857  1.571429  1.571429
                      4.0 3.0     2.000000  1.000000  1.000000  1.000000
       2              1.0 1.0     2.333333  2.666667  2.333333  2.000000
                          3.0     1.000000  1.000000  1.000000  1.000000
                      2.0 1.0     2.344828  1.931034  1.862069  1.655172
                          2.0     2.153846