In [None]:
! pip install factor-analyzer

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import statsmodels.api as sm
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod import families
import statsmodels.stats.outliers_influence as sms

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split , KFold
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix

In [None]:
from factor_analyzer import calculate_bartlett_sphericity
from factor_analyzer import calculate_kmo
from factor_analyzer import FactorAnalyzer

In [None]:
# Load the training data straight from the hosted CSV file.
TRAIN_CSV_URL = "https://raw.githubusercontent.com/UM1412/Data-Set/main/FactorAnalysisTrain.csv"
data_02 = pd.read_csv(TRAIN_CSV_URL)

# Factor Analysis

Considering only columns including numbers rated on a Likert scale.

In [None]:
# Keep every row but only the columns at positions 8..21, which the notebook
# describes as the Likert-scale ratings.
likert_block = slice(8, 22)
data_01 = data_02.iloc[:, likert_block]

In [None]:
# Preview ten randomly chosen rows of the rating columns.
data_01.sample(n=10)

## Factorability

### 01) Bartlett's Test of Sphericity

**The two primary tests commonly used to assess the suitability of a dataset for Factor Analysis are Bartlett's Test of Sphericity and the Kaiser-Meyer-Olkin (KMO) Test.**

Bartlett's Test of Sphericity is a tool used by researchers to determine the appropriateness of employing factor analysis to uncover the underlying structure within their data. By assessing the interrelatedness of variables through their correlation or covariance, the test assists in discerning whether these variables exhibit significant dependencies. If the test suggests that the variables are not independent, it provides support for conducting factor analysis to unveil latent factors within the dataset.

In [None]:
# Run Bartlett's test of sphericity on the rating columns and report both
# returned values.
bartlett_outcome = calculate_bartlett_sphericity(data_01)
chi_square, p_value = bartlett_outcome

for caption, figure in (("Chi-Square Statistic: ", chi_square), ("P-value: ", p_value)):
    print(caption, figure)

**Interpretation**
<br/>The P-value is lower than 0.05, which means that this dataset is suitable for the Factor Analysis!

### 02) KMO-test

The Kaiser-Meyer-Olkin (KMO) measure is a statistical tool employed to evaluate the suitability of a dataset for factor analysis. It assesses the extent to which variables in the dataset share common variance, which is crucial for factor analysis. The KMO measure quantifies the proportion of variance among variables that is shared, providing insight into whether the dataset meets the fundamental assumption of factor analysis.

In [None]:
# Compute the KMO measure: the first value is reported per variable, the
# second for the model as a whole.
kmo_scores = calculate_kmo(data_01)
kmo_all, kmo_model = kmo_scores

for caption, figure in (("KMO for All Variables:", kmo_all), ("KMO for Model:", kmo_model)):
    print(caption, figure)

**Interpretation**
<br/>The average Kaiser-Meyer-Olkin (KMO) measure for the variables exceeds 0.6, and each variable individually also has a KMO score above 0.6.
<br/>Based on these results, I concluded that this dataset is appropriately suited for the Factor Analysis.

## Standardization

In [None]:
# Standardize the rating columns: learn the scaling from the data first, then
# apply it to the very same data.
scaler = StandardScaler()
scaler.fit(data_01)
data_03 = scaler.transform(data_01)

## Principal Component Analysis

In [None]:
# Fit a PCA with default settings on the standardized ratings.
# NOTE: the fitted estimator is deliberately kept under the name `PCA`, because
# later cells read `PCA.explained_variance_`. That name shadows the imported
# class, so the estimator is constructed through the module instead; this keeps
# the cell re-runnable (calling `PCA()` a second time would otherwise fail with
# "'PCA' object is not callable").
from sklearn import decomposition

PCA = decomposition.PCA()
PCA.fit(data_03)

## Deciding the Number of Factors

Decide on the number of principal components to retain. This decision can be based on the cumulative explained variance (usually aiming for a high cumulative variance, e.g., 65-90%) or by using criteria such as the Kaiser criterion (retain components with eigenvalues greater than 1) or scree plot inspection.

In [None]:
# Tabulate, per principal component, its explained variance (labelled "Eigen"),
# its share of the total variance in percent, and the running total of that share.
variance_pct = PCA.explained_variance_ratio_ * 100
result = pd.DataFrame(
    {
        "Eigen": PCA.explained_variance_,
        "Variance_ratio": variance_pct,
        "CumulativeVariance": np.cumsum(variance_pct),
    }
)
# Replace the 0-based positional index with 1-based component labels.
result.index = [f"comp {position + 1}" for position in result.index]

result

**Interpretation**

Since the cumulative explained variance at Component 4 is greater than 65%, we will extract 4 components. The Kaiser criterion agrees: the components up to Component 4 have eigenvalues greater than 1.

In [None]:
# Scree plot: explained variance of every principal component against its
# 1-based component index.
a = PCA.explained_variance_
num_components = len(a)
plt.figure(figsize=(15, 15))
# Build the x positions from the number of components actually present instead
# of a hard-coded range, so the plot keeps working if the column count changes.
plt.plot(np.arange(1, num_components + 1), a, marker="*", linestyle="-")
plt.title('Scree Plot')
plt.xlabel('Principal Component Index')
plt.ylabel('Eigenvalue')
plt.grid(True)
plt.show()

The point at Component 4 shows the elbow. Thus we can conclude that the components up to and including Component 4 capture most of the variance in the data, while the components after the elbow capture less significant variance.

## Factor Analysis Type

### Rotation == None

In [None]:
# Four-factor model without any rotation, fitted on the standardized ratings.
unrotated_settings = {"n_factors": 4, "rotation": None}
Factor_01 = FactorAnalyzer(**unrotated_settings)
Factor_01.fit(data_03)

In [None]:
# Loadings of the unrotated model as a labelled table: rows are indexed by the
# rating columns, columns are named Factor1..FactorN.
# The number of factor columns is read from the loadings matrix itself rather
# than hard-coded, so the labels stay in sync with the `n_factors` used at fit time.
laoding_01 = pd.DataFrame(
    Factor_01.loadings_,
    index=data_01.columns,
    columns=[f"Factor{k}" for k in range(1, Factor_01.loadings_.shape[1] + 1)],
)
laoding_01

### Rotation == Promax

In [None]:
# Four-factor model with promax rotation, fitted on the standardized ratings.
promax_settings = {"n_factors": 4, "rotation": "promax"}
Factor_02 = FactorAnalyzer(**promax_settings)
Factor_02.fit(data_03)

In [None]:
# Loadings of the promax-rotated model as a labelled table: rows are indexed by
# the rating columns, columns are named Factor1..FactorN.
# The number of factor columns is read from the loadings matrix itself rather
# than hard-coded, so the labels stay in sync with the `n_factors` used at fit time.
laoding_02 = pd.DataFrame(
    Factor_02.loadings_,
    index=data_01.columns,
    columns=[f"Factor{k}" for k in range(1, Factor_02.loadings_.shape[1] + 1)],
)
laoding_02

### Rotation == Quartimax

In [None]:
# Four-factor model with quartimax rotation, fitted on the standardized ratings.
quartimax_settings = {"n_factors": 4, "rotation": "quartimax"}
Factor_03 = FactorAnalyzer(**quartimax_settings)
Factor_03.fit(data_03)

In [None]:
# Loadings of the quartimax-rotated model as a labelled table: rows are indexed
# by the rating columns, columns are named Factor1..FactorN.
# The number of factor columns is read from the loadings matrix itself rather
# than hard-coded, so the labels stay in sync with the `n_factors` used at fit time.
laoding_03 = pd.DataFrame(
    Factor_03.loadings_,
    index=data_01.columns,
    columns=[f"Factor{k}" for k in range(1, Factor_03.loadings_.shape[1] + 1)],
)
laoding_03

### Rotation == Varimax

In [None]:
# Four-factor model with varimax rotation, fitted on the standardized ratings.
# This is the model whose scores are used further down in the notebook.
varimax_settings = {"n_factors": 4, "rotation": "varimax"}
Factor_04 = FactorAnalyzer(**varimax_settings)
Factor_04.fit(data_03)

In [None]:
# Loadings of the varimax-rotated model as a labelled table: rows are indexed by
# the rating columns, columns are named Factor1..FactorN.
# The number of factor columns is read from the loadings matrix itself rather
# than hard-coded, so the labels stay in sync with the `n_factors` used at fit time.
laoding = pd.DataFrame(
    Factor_04.loadings_,
    index=data_01.columns,
    columns=[f"Factor{k}" for k in range(1, Factor_04.loadings_.shape[1] + 1)],
)
laoding

**Factor Making**


In assigning names to each factor, we considered the three primary elements exhibiting the highest loading scores.

Factor 1 is labeled "Inflight Comfort & Quality" due to its strong association with aspects enhancing the quality of time spent inside the aircraft.
- Cleanliness (0.854)
- Food and drink (0.77)
- Inflight entertainment (0.766)

Factor 2 is denoted as "Customer Service Quality" since it primarily encompasses components linked to the provision of high-quality service throughout the entire journey, from boarding to arrival.
- Inflight service (0.799)
- Baggage handling (0.76)
- On-board service (0.7)

Factor 3 is titled "Convenience and Efficiency" as it predominantly reflects elements aimed at optimizing time efficiency during the entire travel experience.
- Ease of online booking (0.766)
- Gate location (0.68)
- Inflight Wi-Fi service (0.605)

The fourth and final factor is named "Technological Accessibility" owing to its strong association with technological advancements facilitating convenient access to flight-related services.
- Online boarding (0.7565)
- Inflight Wi-Fi service (0.478)
- Ease of online booking (0.463)

In [None]:
# Compute the factor scores from the same standardized matrix the varimax model
# was fitted on. The model was fitted on `data_03`; feeding it the raw ratings
# in `data_01` would score data on a different scale than the one it learned.
# NOTE(review): per the factor_analyzer docs, `transform` rescales its input
# with the mean/std stored at fit time - here those come from already
# standardized data, so raw input would effectively stay unstandardized.
data_04 = Factor_04.transform(data_03)

In [None]:
# Start from an empty frame; the factor-score columns and the encoded target
# are added to it in the following cells.
data_05 = pd.DataFrame()

In [None]:
# Store each column of the factor-score matrix under its factor name; the
# position in this tuple is the column position in `data_04`.
factor_names = (
    'Inflight Comfort and Quality',
    'Customer Service Quality',
    'Convenience and Efficiency',
    'Technological Accessibility',
)
for position, factor_name in enumerate(factor_names):
    data_05[factor_name] = data_04[:, position]

In [None]:
# Display the frame holding the four named factor-score columns.
data_05

In [None]:
# Heatmap of the three loadings listed for each named factor.
# The values below are typed in by hand; they are not read from the fitted model.
factor_loadings = {
    "Inflight Comfort & Quality": {
        "Cleanliness": 0.854,
        "Food and drink": 0.77,
        "Inflight entertainment": 0.766,
    },
    "Customer Service Quality": {
        "Inflight service": 0.799,
        "Baggage handling": 0.76,
        "On-board service": 0.7,
    },
    "Convenience and Efficiency": {
        "Ease of online booking": 0.766,
        "Gate location": 0.68,
        "Inflight wifi service": 0.605,
    },
    "Technological Accessibility": {
        "Online boarding": 0.7565,
        "Inflight wifi service": 0.478,
        "Ease of online booking": 0.463,
    },
}
# One row per factor first, then flipped so variables are rows and factors are
# columns; variables a factor does not list end up as missing cells.
factor_loadings_df = pd.DataFrame.from_dict(factor_loadings, orient='index').T

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(factor_loadings_df, annot=True, cmap="YlGnBu", cbar=False, ax=ax)
ax.set_title('Factor Loadings')
ax.set_xlabel('Factors')
ax.set_ylabel('Variables')
plt.yticks(rotation=0)
fig.tight_layout()
plt.show()

# Logistic Regression

### Label Encoding

In [None]:
# Encoder used in the next cell to turn the textual satisfaction labels into integers.
label_encoder = LabelEncoder()

In [None]:
# Encode the satisfaction column as integer class labels and store it as the
# target column next to the factor scores.
satisfaction_labels = data_02['satisfaction']
label_encoder.fit(satisfaction_labels)
data_05["Result"] = label_encoder.transform(satisfaction_labels)

In [None]:
# Print a summary of the modelling frame (columns, non-null counts, dtypes).
data_05.info()

### Appropriate outcome type

In [None]:
# Frequency of each original satisfaction label.
satisfaction_counts = data_02['satisfaction'].value_counts()
print(satisfaction_counts)

In [None]:
# Frequency of each encoded target value.
encoded_counts = data_05["Result"].value_counts()
print(encoded_counts)

**Interpretation**

There are only two outcomes (a binary classification: "neutral or dissatisfied" versus "satisfied"), so we will use binary logistic regression (the model that is fitted when `family=Binomial` is specified for a statsmodels GLM).


### Multicollinearity

In [None]:
def calc_vif(df):
    """Return a two-column table with the VIF of every column of *df*.

    The result has one row per column of *df*: ``variables`` holds the column
    name and ``VIF`` the value returned by statsmodels'
    ``variance_inflation_factor`` for that column position. The values of
    *df* are passed as-is; no constant column is added here.
    """
    design = df.values
    scores = [
        sms.variance_inflation_factor(design, position)
        for position in range(df.shape[1])
    ]
    return pd.DataFrame({"variables": df.columns, "VIF": scores})

# VIF of the four factor-score columns (the target column is left out).
calc_vif(
    data_05.loc[
        :,
        [
            'Inflight Comfort and Quality',
            'Customer Service Quality',
            'Convenience and Efficiency',
            'Technological Accessibility',
        ],
    ]
)

**Decision Rule**

The threshold for VIF is usually 5 (i.e. values above 5 means there is presence of multicollinearity)

(Since all the variables have VIF < 5, it means that there is no multicollinearity, and this assumption is satisfied)

In [None]:
# Predictors are the four factor-score columns; the target is the encoded
# satisfaction label.
predictor_columns = [
    'Inflight Comfort and Quality',
    'Customer Service Quality',
    'Convenience and Efficiency',
    'Technological Accessibility',
]
X = data_05[predictor_columns]
Y = data_05["Result"]

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
split_parts = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_test, Y_train, Y_test = split_parts

In [None]:
# Fit a logistic regression with default settings on the training split.
logreg = LogisticRegression()
logreg.fit(X=X_train, y=Y_train)

In [None]:
# Predicted class labels for the held-out rows.
Y_pred = logreg.predict(X_test)

In [None]:
# Display the true labels of the held-out rows.
Y_test

In [None]:
# Share of held-out rows the classifier labels correctly, printed with two decimals.
test_accuracy = logreg.score(X_test, Y_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(test_accuracy))

In [None]:
# Confusion matrix of the true held-out labels against the model's predictions.
# (Comparing `Y_test` with itself would always give a perfect diagonal.)
# The result is stored under its own name so the imported `confusion_matrix`
# function is not overwritten and the cell can be re-run.
conf_matrix = confusion_matrix(Y_test, Y_pred)
print(conf_matrix)

In [None]:
# ROC analysis on the held-out split.
# Both the curve and the area under it are computed from the predicted
# probability of the second class (column 1 of `predict_proba`), so the number
# in the legend is the area under the plotted curve. Scoring hard 0/1
# predictions instead would describe a single operating point, not the curve.
positive_proba = logreg.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(Y_test, positive_proba)
fpr, tpr, thresholds = roc_curve(Y_test, positive_proba)
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Diagonal reference line (equal false- and true-positive rate).
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Save before show() so the written file contains the drawn figure.
plt.savefig('Log_ROC')
plt.show()

In [None]:
# Marker cell: signals that the notebook ran through to the end.
print("The End")