In [2]:
import nltk
import random
import json
from nltk.corpus import wordnet
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import accuracy_score

In [3]:
intents = {
  "intents": [
    {
      "tag": "greeting",
      "patterns": ["Hi", "Hello", "Hey", "Good day", "How are you?"],
      "responses": ["Hello!", "Good to see you!", "Hi there, how can I help?"],
    },
    {
      "tag": "farewell",
      "patterns": ["Goodbye", "Bye", "See you later", "Talk to you later"],
      "responses": ["Sad to see you go :(", "Goodbye!", "Come back soon!"],

    },
    {
      "tag": "creator",
      "patterns": ["Who created you?", "Who is your developer?", "Who made you?"],
      "responses": ["I was created by Shivani & Arthiga"]

    },
    {
      "tag": "identity",
      "patterns": ["What is your name?", "What should I call you?", "Who are you?","What are you","Introduce Yourself"],
      "responses": ["You can call me Jarvis. I'm a Chatbot."]

    },
    
    {
      "tag": "casual_greeting",
      "patterns": ["What's up?", "How are you?", "How you doing?"],
       "responses": ["I'm here to assist you with any questions or information you need. How can I assist you today?"]

     },
    {
      "tag": "good_morning",
      "patterns": ["Good morning", "Morning"],
      "responses": ["Good morning! How can I assist you today?"]

     },
     {
       "tag": "good_afternoon",
       "patterns": ["Good afternoon", "Afternoon"],
        "responses": ["Good afternoon! How can I assist you today?"]

      },
      {
      "tag": "good_evening",
      "patterns": ["Good evening", "Evening"],
       "responses": ["Good evening! How can I assist you today?"]

         },
          {
        "tag": "thank_you",
        "patterns": ["Thank you", "Thanks"],
        "responses": ["You're welcome! If you have any more questions, feel free to ask."]

        },
       {
       "tag": "sorry",
      "patterns": ["Sorry", "Apologies"],
       "responses": ["No problem! If there's anything else you need assistance with, feel free to let me know."]

    },
    {
         "tag": "Total_Failures",
      "patterns": ["Total Failures","Today count of machine failure?","How many machines failed?"],
       "responses": ["Here are the results!"]
    },
    {
        "tag": "Total_Non_Failures",
      "patterns": ["Total Avalible machines","Today count of machine present?","How many machines available?"],
       "responses": ["Here are the results!"]
    }  ,
    { "tag": "Common_Failure_Conditions",
      "patterns": ["what are the conditions for failure","common failure conditions?"],
       "responses": ["Here are the results!"]
        
    },
    {
        "tag": "Average_Temperature",
      "patterns": ["what is the average temperature for failure","Average Temperature for failure"],
       "responses": ["Here are the results!"]
    },
    {
        "tag": "High_VOC",
      "patterns": ["what is the Failure Rate with High VOC "],
       "responses": ["Here are the results!"]
    },
    {
        "tag": "High_Footfall",
      "patterns": ["what is the Failure Rate with High Footfall? "],
       "responses": ["Here are the results!"]
    },
  {
       "tag": "CS_Level",
      "patterns": ["what is the Failure Rate with CS Level? "],
       "responses": ["Here are the results!"]
  },
  {
       "tag": "Poor_Air_Quality ",
      "patterns": ["what is the Failure Rate with Poor Air Quality ? "],
       "responses": ["Here are the results!"]
  },
  {
       "tag": "High_IP",
      "patterns": ["what is the Failure Rate with High IP? "],
       "responses": ["Here are the results!"]
  }
]
}

In [4]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rockstar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Function to perform synonym replacement
def synonym_replacement(tokens, limit):
    augmented_sentences = []
    for i in range(len(tokens)):
        synonyms = []
        for syn in wordnet.synsets(tokens[i]):
            for lemma in syn.lemmas():
                synonyms.append(lemma.name())
        if len(synonyms) > 0:
            num_augmentations = min(limit, len(synonyms))
            sampled_synonyms = random.sample(synonyms, num_augmentations)
            for synonym in sampled_synonyms:
                augmented_tokens = tokens[:i] + [synonym] + tokens[i+1:]
                augmented_sentences.append(' '.join(augmented_tokens))
    return augmented_sentences

In [6]:
text_data = []
labels = []
stopwords = set(nltk.corpus.stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


limit_per_tag = 40

for intent in intents['intents']:
    augmented_sentences_per_tag = 0
    for example in intent['patterns']:
        tokens = nltk.word_tokenize(example.lower())
        filtered_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stopwords and token.isalpha()]
        if filtered_tokens:
            text_data.append(' '.join(filtered_tokens))
            labels.append(intent['tag'])
            
            augmented_sentences = synonym_replacement(filtered_tokens, limit_per_tag - augmented_sentences_per_tag)
            for augmented_sentence in augmented_sentences:
                text_data.append(augmented_sentence)
                labels.append(intent['tag'])
                augmented_sentences_per_tag += 1
                if augmented_sentences_per_tag >= limit_per_tag:
                    break

print(len(text_data))
print(len(labels))

717
717


In [7]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(text_data)
y = labels

In [14]:
def find_best_model(X, y, test_size=0.2):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=100)


    models = [
        ('Logistic Regression', LogisticRegression(), {
            'penalty': ['l2'],
            'C': [0.1, 1.0, 10.0],
            'solver': ['liblinear'],
            'max_iter': [100, 1000, 10000]
        }),
        ('Multinomial Naive Bayes', MultinomialNB(), {'alpha': [0.1, 0.5, 1.0]}),
        ('Linear SVC', LinearSVC(), {
            'penalty': ['l2'],
            'loss': ['hinge', 'squared_hinge'],
            'C': [0.1, 1, 10],
            'max_iter': [100, 1000, 10000]
        }),
        ('Decision Tree', DecisionTreeClassifier(), {
            'max_depth': [5, 10, 20, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'criterion': ['gini', 'entropy']
        }),
        ('Random Forest', RandomForestClassifier(), {
            'n_estimators': [100, 200, 300],
            'max_depth': [10, 20, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        })
    ]

    for name, model, param_grid in models:
        grid = GridSearchCV(model, param_grid, cv=3, n_jobs=-1)
        grid.fit(X_train, y_train)
        y_pred = grid.predict(X_test)
        score = accuracy_score(y_test, y_pred)
        print(f'{name}: {score:.4f} (best parameters: {grid.best_params_})')

    best_model = max(models, key=lambda x: GridSearchCV(x[1], x[2], cv=3, n_jobs=-1).fit(X_train, y_train).score(X_test, y_test))
    print(f'\nBest model: {best_model[0]}')

    # Fit the best model to the full training data
    best_model[1].fit(X, y)

    return best_model[1]

In [15]:
best_model = find_best_model(X, y)

Logistic Regression: 0.9048 (best parameters: {'C': 0.1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'})
Multinomial Naive Bayes: 0.4656 (best parameters: {'alpha': 0.1})
Linear SVC: 0.8995 (best parameters: {'C': 1, 'loss': 'hinge', 'max_iter': 10000, 'penalty': 'l2'})
Decision Tree: 0.9206 (best parameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 5})
Random Forest: 0.9153 (best parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100})

Best model: Decision Tree


In [8]:
import pandas as pd

In [9]:
df=pd.read_csv('data.csv')
print(df.head())

   footfall  tempMode  AQ  USS  CS  VOC  RP  IP  Temperature  fail
0         0         7   7    1   6    6  36   3            1     1
1       190         1   3    3   5    1  20   4            1     0
2        31         7   2    2   6    1  24   6            1     0
3        83         4   3    4   5    1  28   6            1     0
4       640         7   5    6   4    0  68   6            1     0


In [3]:
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Assuming the target variable is 'fail' and features include the sensor readings and other metrics
X = df.drop('fail', axis=1)  # Feature variables
y = df['fail']  # Target variable (machine failure)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model (using GradientBoostingClassifier as an example)
model = GradientBoostingClassifier()
model.fit(X_train, y_train)

# Test the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Functions to answer specific questions

# 1. Total count of machine failures
def total_failures():
    return df['fail'].sum()

# 2. Total count of machines that did not fail
def total_non_failures():
    return (df['fail'] == 0).sum()

# 3. Most common sensor readings associated with machine failure
def common_failure_conditions():
    failure_conditions = df[df['fail'] == 1].mode().iloc[0]
    return failure_conditions.to_dict()

# 4. Average temperature at the time of machine failure
def avg_temp_failure():
    return df[df['fail'] == 1]['Temperature'].mean()


# 6. Likelihood of failure with high VOC levels
def failure_with_high_voc(voc_threshold):
    high_voc_failures = df[(df['VOC'] > voc_threshold) & (df['fail'] == 1)]
    total_high_voc = df[df['VOC'] > voc_threshold]
    if len(total_high_voc) == 0:
        return 0
    return len(high_voc_failures) / len(total_high_voc) * 100

# 7. Effect of footfall on machine failure rates
def failure_rate_by_footfall(footfall_threshold):
    high_footfall_failures = df[(df['footfall'] > footfall_threshold) & (df['fail'] == 1)]
    total_high_footfall = df[df['footfall'] > footfall_threshold]
    if len(total_high_footfall) == 0:
        return 0
    return len(high_footfall_failures) / len(total_high_footfall) * 100

# 8. Failures when cooling system level is above 5
def failures_with_high_cs(cs_threshold):
    return len(df[(df['CS'] > cs_threshold) & (df['fail'] == 1)])

# 9. Relationship between air quality (AQ) and machine failure
def aq_failure_relation(aq_threshold):
    poor_aq_failures = df[(df['AQ'] > aq_threshold) & (df['fail'] == 1)]
    total_poor_aq = df[df['AQ'] > aq_threshold]
    if len(total_poor_aq) == 0:
        return 0
    return len(poor_aq_failures) / len(total_poor_aq) * 100

# 10. Effect of input power (IP) on failure probability
def failure_rate_by_ip(ip_threshold):
    high_ip_failures = df[(df['IP'] > ip_threshold) & (df['fail'] == 1)]
    total_high_ip = df[df['IP'] > ip_threshold]
    if len(total_high_ip) == 0:
        return 0
    return len(high_ip_failures) / len(total_high_ip) * 100




In [11]:
# Example usage:
print(f"Total Failures: {total_failures()}")
print(f"Total Non-Failures: {total_non_failures()}")
print(f"Common Failure Conditions: {common_failure_conditions()}")
print(f"Average Temperature at Failure: {avg_temp_failure():.2f}")
print(f"Failure Rate with High VOC (>50): {failure_with_high_voc(50):.2f}%")
print(f"Failure Rate by High Footfall (>200): {failure_rate_by_footfall(200):.2f}%")
print(f"Failures with CS Level > 5: {failures_with_high_cs(5)}")
print(f"Failure Rate with Poor Air Quality (AQ > 70): {aq_failure_relation(70):.2f}%")
print(f"Failure Rate by High IP (>500): {failure_rate_by_ip(500):.2f}%")

Total Failures: 393
Total Non-Failures: 551
Common Failure Conditions: {'footfall': 0, 'tempMode': 7, 'AQ': 6, 'USS': 2, 'CS': 6, 'VOC': 6, 'RP': 38, 'IP': 6, 'Temperature': 21, 'fail': 1}
Average Temperature at Failure: 17.68
Failure Rate with High VOC (>50): 0.00%
Failure Rate by High Footfall (>200): 29.52%
Failures with CS Level > 5: 224
Failure Rate with Poor Air Quality (AQ > 70): 0.00%
Failure Rate by High IP (>500): 0.00%


In [30]:
def chatbot_response(user_input):
    input_text = vectorizer.transform([user_input])
    predicted_intent = best_model.predict(input_text)[0]
    
    for intent in intents['intents']:
        if intent['tag'] == "Total_Failures":
            response=total_failures()
            break
        elif intent['tag'] == "Total_Non_Failures":
            response=total_non_failures()
            break
        elif intent['tag'] == "Common_Failure_Conditions":
            response=common_failure_conditions()
            break
        elif intent['tag'] == "Average_Temperature":
            response=avg_temp_failure()
            break
        elif intent['tag'] == "High_VOC":
            response=failure_with_high_voc(50)
            break
        elif intent['tag'] == "High_Footfall":
            response=failure_rate_by_footfall(200)
            break
        elif intent['tag'] == "CS_Level":
            response=failures_with_high_cs(5)
            break
        elif intent['tag'] == "Poor_Air_Quality":
            response=aq_failure_relation(70)
            break
        elif intent['tag'] == "High_IP":
            response=failure_rate_by_ip(500)
            break
        elif intent['tag'] == predicted_intent:
            response = random.choice(intent['responses'])
            break
            
    return response

In [32]:
user_input='hello'
response = chatbot_response(user_input)
print(response)

Hello!


In [35]:
user_input='Total Failures'
response = chatbot_response(user_input)
print(response)

393


In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

# Train the vectorizer on the full dataset
X_train = vectorizer.fit_transform(text_data)

# Model training
from sklearn.tree import DecisionTreeClassifier
best_model = DecisionTreeClassifier()
best_model.fit(X_train, labels)

In [23]:
print('Hello! I am a chatbot. How can I help you today? Type "quit" to exit.')
while True:
    user_input = input('>>> ')
    if user_input.lower() == 'quit':
        break
    response = chatbot_response(user_input)
    print(response)

Hello! I am a chatbot. How can I help you today? Type "quit" to exit.


In [1]:
print("Jarvis")

Jarvis


In [34]:
import os
import pickle


if not os.path.exists('model'):
    os.makedirs('model')

if not os.path.exists('dataset'):
    os.makedirs('dataset')

# Save the trained model
with open('model/chatbot_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

# Save the vectorizer
with open('model/vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

# Save the intents to the "dataset" folder
with open('dataset/intents1.json', 'w') as f:
    json.dump(intents, f)