<a href="https://colab.research.google.com/github/RogerJL/LTU/blob/main/eMaintenance/machine_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Submit your solutions in pdf format, with code and plots supporting your answers.
machine_data contains raw data for a part from 3 manufacturers: A, B and C.
The system is run to failure under load.
The load and the operation time are provided in each row.

What is the range of load and time during operation for each manufacturer?
What is the most expected load value?
How are the load and time related?
Which distribution best describes the load?
Which distribution best describes the time?

Which manufacturer has the best performance and why?


In [65]:
%matplotlib notebook

from math import sqrt

import numpy as np
import pandas as pd

import matplotlib
import lets_plot as lplt
import matplotlib.pyplot as plt
#matplotlib.use("TkAgg")

from scipy.stats import weibull_min, norm, uniform, expon


In [66]:
# Load the raw measurements from the CSV file that sits next to the notebook,
# then report the frame's (rows, columns) as a quick sanity check.
df = pd.read_csv('machine_data.csv')
print("machine_data", (len(df.index), len(df.columns)))


machine_data (1229, 4)


Drop the index column, fix the misspelled `manufacturef` column name and normalise the label `c` to `C`.

In [67]:
# Tidy the raw frame without in-place mutation, so this cell can be re-run safely:
#  - drop the stale CSV index column 'Unnamed: 0' (ignored when already removed),
#  - give the misspelled 'manufacturef' column its proper name 'manufacturer'
#    (a no-op when it has already been renamed),
#  - normalise the stray lower-case label 'c' to 'C'.
# The resulting column order is unchanged: time, load, manufacturer.
df = (
    df
    .drop(columns='Unnamed: 0', errors='ignore')
    .rename(columns={'manufacturef': 'manufacturer'})
    .assign(manufacturer=lambda frame: frame['manufacturer'].replace({'c': 'C'}))
)
print(df.columns)

Index(['time', 'load', 'manufacturer'], dtype='object')


In [68]:
# Column-wise summary statistics over all manufacturers together.
# min/max are taken over every column (including the text column);
# mean/std are restricted to the numeric columns.
minimum, maximum = df.min(), df.max()
mean, std = df.mean(numeric_only=True), df.std(numeric_only=True)


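Extract data for a given manufacturer and compare the load distributions: one histogram per manufacturer, with a dashed line marking each manufacturer's mean load.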

In [69]:
# Enable lets-plot's HTML output for this notebook session.
lplt.LetsPlot.setup_html()

# Mean load of each manufacturer; these values position the dashed marker lines.
classes = ['A', 'B', 'C']
bounds_df = pd.DataFrame({
    'c': classes,
    'load_mean': [df.loc[df.manufacturer == cl, 'load'].mean() for cl in classes],
})
print(bounds_df)

# One dashed vertical line per manufacturer at its mean load.
mean_lines = lplt.geom_vline(
    lplt.aes(xintercept=bounds_df.load_mean, color=bounds_df.c),
    linetype="dashed",
)

# Dodged load histogram, filled per manufacturer, with the mean lines on top.
# The plot object is the last expression of the cell, so the notebook renders it.
load_by_manufacturer = (
    lplt.ggplot(df, lplt.aes(x='load', fill='manufacturer'))
    + lplt.ggsize(700, 300)
    + lplt.geom_histogram(position='dodge', alpha=0.7)
    + lplt.scale_fill_brewer(type='seq')
    + lplt.theme(panel_grid_major_x='blank')
    + mean_lines
)
load_by_manufacturer

   c  load_mean
0  A  74.497625
1  B  74.686092
2  C  74.376665


In [None]:

# Per-manufacturer analysis figure.
#
# Layout: the top-left panel is a load-vs-time scatter of all manufacturers; below it
# there is one row per manufacturer with a load histogram (left) and a time histogram
# (right). Each histogram is overlaid with the per-bin counts expected from a fitted
# distribution (normal for load, Weibull for time). The top-right panel stays empty.

# The key is passed as a list, so depending on the pandas version the group name is
# either a 1-tuple or a plain scalar; both cases are handled inside the loop.
grpByManu = df.groupby(['manufacturer'])

HIST_BINS = 20  # number of bins used for every histogram


def plot_cdf(ax, edges, cdf):
    """Overlay the expected number of samples per histogram bin on ``ax``.

    ax    -- matplotlib axes that already holds the histogram
    edges -- bin edges as returned by ``Axes.hist`` (one more entry than bins)
    cdf   -- callable giving the cumulative expected count at each edge

    The expected count of a bin is ``cdf(right edge) - cdf(left edge)``; it is
    drawn at the bin centre so that it lines up with the histogram bars.
    """
    edges = np.asarray(edges)
    left, right = edges[:-1], edges[1:]
    ax.plot((left + right) / 2, cdf(right) - cdf(left))


# One row for the scatter plot plus one row per manufacturer actually present in the
# data. squeeze=False keeps ``axs`` two-dimensional whatever the number of rows.
fig, axs = plt.subplots(1 + grpByManu.ngroups, 2, sharex=False, sharey=False, squeeze=False)
for load_axis in axs[:, 0]:
    load_axis.set(xlim=(minimum['load'], maximum['load']))
for time_axis in axs[:, 1]:
    time_axis.set(xlim=(minimum['time'], maximum['time']))

# ``ax`` is the scatter panel; ``axs`` now holds only the per-manufacturer rows.
ax, axs = axs[0, 0], axs[1:, :]
ax.set(ylim=(minimum['time'], maximum['time']))
ax.set_title("Relation between load and time")
ax.set_xlabel("Load")
ax.set_ylabel("Time")

# Evenly spaced grids over the observed ranges. They are not used in this cell;
# kept in case other cells reference them.
time_range = np.linspace(minimum['time'], maximum['time'], 200)
load_range = np.linspace(minimum['load'], maximum['load'], 200)

for index, (key, dfa) in enumerate(grpByManu):
    # Unwrap the 1-tuple key produced by newer pandas versions.
    manufacturer = key[0] if isinstance(key, tuple) else key
    name = f"Manufacturer {manufacturer}"

    load = dfa['load']
    time = dfa['time']
    load_ax, time_ax = axs[index, 0], axs[index, 1]

    # Is there a relationship between load and time?
    ax.scatter(load, time, label=name)

    # Characteristics of the data: mean, median, mode.
    # mode() returns a Series (several modes are possible), so it is printed as a
    # plain list instead of a multi-line Series repr.
    print(f"{name}, load size={load.size} mean={load.mean()}, median={load.median()}, "
          f"mode={load.round().mode().tolist()}")

    # How is load distributed, and why does it matter?
    # Candidates: uniform, normal, exponential, Weibull.
    # Author's assumption: load is a sampled physical quantity, so a normal
    # distribution is expected; other distributions can have skewness and kurtosis
    # built in. If load is not normal, the measurements should be questioned.
    _, load_edges, _ = load_ax.hist(load, bins=HIST_BINS, label=name)
    load_mean, load_std = norm.fit(load)

    def load_cdf(x):
        """Cumulative expected sample count under the fitted normal distribution."""
        return load.size * norm.cdf(x, load_mean, load_std)

    plot_cdf(load_ax, load_edges, load_cdf)
    load_ax.set_ylabel(name)

    # Variance and standard deviation (what is the meaning of 6 sigma?).
    print(f"{name}, load var={load.var()}, stddev={load.std()}")

    # Time to failure: histogram with a fitted Weibull distribution.
    # (Another plot that could be useful here is a boxplot.)
    _, time_edges, _ = time_ax.hist(time, bins=HIST_BINS, label=name)

    # NOTE(review): loc is left free in the fit, as in the original analysis; pinning
    # it (floc=0) is a common choice for lifetime data - confirm which is intended.
    c, loc, scale = weibull_min.fit(time)

    def time_cdf(x):
        """Cumulative expected sample count under the fitted Weibull distribution."""
        return time.size * weibull_min.cdf(x, c=c, loc=loc, scale=scale)

    plot_cdf(time_ax, time_edges, time_cdf)

    # Column headings only on the first histogram row, to keep the grid uncluttered.
    if index == 0:
        load_ax.set_title("Load: histogram and normal fit")
        time_ax.set_title("Time: histogram and Weibull fit")

ax.legend()
fig.tight_layout()
plt.show()
