In [15]:
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
import plotly.io as pio

In [16]:
# Read in data
airplane_df = pd.read_csv('../data/airplanes_final.csv')

In [17]:
#create a correlation matrix
corr = airplane_df.corr()





In [18]:
train_data = pd.read_csv('../data/train_final.csv')

In [19]:
#Remove unwanted columns
train_data = train_data.drop(['CASKLDRR', 'CASINJRR', 'CARSDMG','CARSHZD', 'Latitude', 'Longitud'], axis=1)


In [20]:
#rename columns temp to Temperature
train_data = train_data.rename(columns={'TEMP': 'Temperature', 'TRNSPD': 'Speed', 'ACCDMG': 'Damage'})

In [21]:
corr_train = train_data.corr()





In [22]:

# Create the interactive corrplot using Plotly
fig = go.Figure()

# Add the initial correlation plot for airplane data
fig.add_trace(go.Heatmap(
    z=corr.values,
    x=list(corr.columns),
    y=list(corr.columns),
    colorscale='Blues'
))

# Define the dropdown menu
dropdown = go.layout.Updatemenu(
    buttons=list([
        dict(
            args=[{'z': [corr.values], 'x': [list(corr.columns)], 'y': [list(corr.columns)]}],
            label='Airplane',
            method='update'
        ),
        dict(
            args=[{'z': [corr_train.values], 'x': [list(corr_train.columns)], 'y': [list(corr_train.columns)]}],
            label='Train',
            method='update'
        )
    ]),
    direction='down',
    showactive=True,
    active=0,
)

# Add the dropdown menu to the figure layout
fig.update_layout(
    updatemenus=[dropdown],
    title='Correlation Matrix for Airplane and Train Data',
    width=700,
    height=500
)
#save figure
pio.write_html(fig, file='../img/correlation/correlation.html', auto_open=False)
# Show the figure
fig.show()


In [23]:
# Preprocess the Summary column using CountVectorizer
cv = CountVectorizer(stop_words='english', max_features=1000)
X = cv.fit_transform(airplane_df['Summary'].fillna('').values)
words = cv.get_feature_names()

# Create a new DataFrame with the word counts
word_counts = pd.DataFrame(X.toarray(), columns=words)

# Compute the correlation matrix
corr_words = word_counts.corr()


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.



In [24]:
word_counts

Unnamed: 0,000,10,100,11,12,13,14,15,150,16,...,wings,wires,witness,witnesses,wooded,wreckage,wrong,yards,year,york
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1358,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1359,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1360,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1361,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# Calculate the total frequency for each word
word_counts_total = word_counts.sum()

# Sort the words by frequency in descending order
top_words = word_counts_total.sort_values(ascending=False).head(10).index
top_words = top_words.tolist()

top_words = word_counts[top_words]
top_words["Fatalities"] = airplane_df["Fatalities"]

# Create the correlation matrix
corr_words = top_words.corr()

# Create the interactive correlation plot using Plotly
fig = px.imshow(corr_words,
                color_continuous_scale='Blues')

fig.update_layout(
    width=600,
    height=600,
    title='Correlation Matrix for Most Used words and Fatalities'
)
pio.write_image(fig, file='../img/correlation/corr_matrix_airplane.png', format='png')
fig.show()






A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [26]:
# Preprocess the Summary column using CountVectorizer
cv = CountVectorizer(stop_words='english', max_features=50)
X = cv.fit_transform(train_data['DESCRIPTION'].fillna('').values)
words_train = cv.get_feature_names()

# Create a new DataFrame with the word counts
word_counts_train = pd.DataFrame(X.toarray(), columns=words_train)


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.



In [27]:
# Calculate the total frequency for each word
word_counts_total_train = word_counts_train.sum()

# Sort the words by frequency in descending order
top_words_train = word_counts_total_train.sort_values(ascending=False).head(10).index
top_words_train = top_words_train.tolist()

# Filter the word_counts DataFrame to include only the most frequent words
top_words_train = word_counts_train[top_words_train]
top_words_train["Damage"] = train_data["Damage"]

# Create the correlation matrix
corr_words_train = top_words_train.corr()

# Create the interactive correlation plot using Plotly
fig = px.imshow(corr_words_train,
                color_continuous_scale='Blues')

fig.update_layout(
    width=600,
    height=600,
    title='Correlation Matrix for Most Used words and Damage'
)
pio.write_image(fig, file='../img/correlation/corr_matrix_train.png', format='png')
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [28]:

# Create the interactive corrplot using Plotly
fig = go.Figure()

# Add the initial correlation plot for airplane data
fig.add_trace(go.Heatmap(
    z=corr_words.values,
    x=list(corr_words.columns),
    y=list(corr_words.columns),
    colorscale='Blues'
))

# Define the dropdown menu
dropdown = go.layout.Updatemenu(
    buttons=list([
        dict(
            args=[{'z': [corr_words.values], 'x': [list(corr_words.columns)], 'y': [list(corr_words.columns)]}],
            label='Airplane',
            method='update'
        ),
        dict(
            args=[{'z': [corr_words_train.values], 'x': [list(corr_words_train.columns)], 'y': [list(corr_words_train.columns)]}],
            label='Train',
            method='update'
        )
    ]),
    direction='down',
    showactive=True,
    active=0,
    x=0.9,
    xanchor='left',
    y=1.2,
    yanchor='top'
)

# Add the dropdown menu to the figure layout
fig.update_layout(
    updatemenus=[dropdown],
    title='Most Used words in Airplane Crash Summaries <br>And Train Accident Description',
    width=750,
    height=600
)

#save figure
pio.write_html(fig, file='../img/correlation/correlation_words.html', auto_open=False)

# Show the figure
fig.show()
