In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import average_precision_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score

  from numpy.core.umath_tests import inner1d


In [2]:
def reading_the_files():
    
    """reading the input files for the project"""
    # reading the input file
    data = pd.read_csv(r"C:\Users\Chandu Jagarlamundi\Desktop\Thesis_Wind data\Data Wind_extern\data_eng.csv")
    # reading the anlagestatus file, to get the details regarding anlage status
    status_data = pd.read_excel(r"C:\Users\Chandu Jagarlamundi\Desktop\Thesis_Wind data\Data Wind_extern\system_status.xlsx")      
    return data, status_data

In [3]:
files = reading_the_files()
data = files[0]
status_data = files[1]

In [4]:
def preprocessing(data):    
    # deleting the columns which have lots of missing values
    columns_data = data.columns
    for i in columns_data:
        if data[i].isna().sum()>(len(data)/2):
            print("The number of missing values in the column {} are {} ".format(i,data[i].isna().sum()))
            data.drop([i], axis = 1, inplace = True)
    columns_data = data.columns
    
    # deleting the columns which have single values
    for i in columns_data:
        if len(data[i].value_counts())<2:
            print("{} is eliminated".format(i))
            data.drop([i], axis = 1, inplace = True)
            
    data.Equipment = data.Equipment[data.Equipment!='Anlage']
    
    # dropping the missing rows
    data.dropna(axis = 0, inplace=True)
    data.drop(['Date(Remote)', 'Time(Remote)', 'Date(Server)', 'Time(Server)', "operating_state"], axis=1, inplace=True)
    data.Equipment = pd.Categorical(data.Equipment, categories=data.Equipment.unique()).codes
    # one hot encoding the Equipment feature
    data = pd.concat([data, pd.get_dummies(data.Equipment)], axis=1)
    data.drop(['Equipment'], axis=1, inplace=True)
    return data

In [5]:
data = preprocessing(data)

The number of missing values in the column apparent_power are 780813 
The number of missing values in the column generator_speed are 437865 
The number of missing values in the column nacelle_view are 780813 
The number of missing values in the column Digital _1 are 780813 
The number of missing values in the column Digital_2 are 780813 
The number of missing values in the column Three_phase_current_controller_Setpoint are 780813 
The number of missing values in the column wind_direction_deviation are 780813 
The number of missing values in the column Average_power_5_sec are 780813 
The number of missing values in the column Average_power_30_sec are 780813 
The number of missing values in the column switched_on_reactive_power are 780813 
The number of missing values in the column performance_class are 780813 
The number of missing values in the column Condition_Sheet are 780813 
The number of missing values in the column No_comp_levels are 780813 
Time_difference is eliminated
fast_rat

In [6]:
def splitting_valid_float(data, status_data):
    """ splitting the data into valid, invalid datasets and 
    then mapping the valid dataset with the status text"""
    # getting the true values of systemstatus
    common_status = np.intersect1d(status_data['Status_Number'], data['system_status'])
    # getting the valid data
    data_valid = data[data.system_status.isin(common_status)]
    data_float = data[~data.system_status.isin(common_status)]
    # converting the required data from the status_data into a dictionary    
    return data_valid, data_float

In [7]:
data_split = splitting_valid_float(data, status_data)
data_valid = data_split[0]
data_float = data_split[1]

In [8]:
def split_data_valid(data_valid):
    # obtaining the fraction of the data
    data_sample= data_valid.sample(frac=0.1).reset_index(drop=True)
    target_sample = data_sample['system_status']
    #####an error here###################
    data_sample.drop(['system_status'], axis=1, inplace=True)
    system_stats = pd.factorize(target_sample)
    target_sample = system_stats[0]
    train_x, val_x, train_y, val_y = train_test_split(
            data_sample, target_sample, test_size=0.33, random_state=42)
    return train_x, val_x, train_y, val_y 

In [9]:
split_data = split_data_valid(data)
train_x, val_x, train_y, val_y = split_data[0], split_data[1], split_data[2], split_data[3]

In [10]:
def processing_data_float(data_float):
    target_float = data_float['system_status']
    data_float.drop(['system_status'], axis=1, inplace=True)    
    return data_float, target_float 

In [11]:
float_data = processing_data_float(data_float)
data_float = float_data[0]
target_float = float_data[1]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [12]:
def random_forrest(train_x, val_x, train_y, val_y , data_float, target_float):
    """applying the random forest for the set of data"""    
    classifier_rf = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42)
    classifier_rf.fit(train_x, train_y)
    pred_y_rf = classifier_rf.predict(val_x)
    acc_rf_valid = accuracy_score(val_y, pred_y_rf)
    print('The accuracy of the random forest with the valid dataset is: {}'.format(acc_rf_valid))
    pred_float_rf = classifier_rf.predict(data_float)
    return pred_float_rf

In [13]:
rf = random_forrest(train_x, val_x, train_y, val_y, data_float, target_float)

The accuracy of the random forest with the valid dataset is: 0.9078487695054732


In [14]:
file = open('pred_float.txt', 'w')
for i in rf:
    file.write(str(i)+'\n')
file.close()
file = open('target_float.txt', 'w')
for i in target_float:
    file.write(str(i)+'\n')
file.close()

In [15]:
status = pd.DataFrame()
status["Status_number"] = status_data.Status_Number
status["status_text"] = status_data.status_text
status.dropna(axis=0, inplace=True)    
status_text = status.set_index("Status_number").T.to_dict('list')
# getting the details of the system status
data_valid['system_status'] = data_valid['system_status'].map(status_text)

  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [16]:
def whole_data_test(data):
    # obtaining the fraction of the data
    data_sample= data.sample(frac=0.1).reset_index(drop=True)
    print('The shape of data_sample is {}'.format(data_sample.shape))
    target_sample = data_sample['system_status']
    print('The shape of target_sample is {}'.format(target_sample.shape))
    data_sample.drop(['system_status'], axis=1, inplace=True)
    target_sample = target_sample.astype('str')
    train_x, val_x, train_y, val_y = train_test_split(
            data_sample, target_sample, test_size=0.30, random_state=42)
    classifier_rf = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42)
    classifier_rf.fit(train_x, train_y)
    pred_y_rf = classifier_rf.predict(val_x)
    feat_importances = pd.Series(classifier_rf.feature_importances_, index=val_x.columns)
    feat_importances.nlargest(20).plot(kind='barh')
    return pred_y_rf, val_y

In [17]:
pred = whole_data_test(data)
pred_stats = pred[0]
target_sample = pred[1]

The shape of data_sample is (78064, 53)
The shape of target_sample is (78064,)


In [18]:
file = open('pred_float_data.txt', 'w')
for i in rf:
    file.write(str(i)+'\n')
file.close()
file = open('target_sample.txt', 'w')
for i in target_sample:
    file.write(str(i)+'\n')
file.close()

In [19]:
if pred_stats.shape[0] == target_sample.shape[0]:
    acc = accuracy_score(pred_stats, target_sample)
    print(acc)
else:
    print("not equal :(")

print(len(set(target_sample))-len(set(pred_stats)))

0.9061912894961571
122
