## NLP task

### Import needed packages 

In [1]:
from transformers import pipeline
import pandas as pd
from tqdm import tqdm
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import os
import subprocess
import numpy as np
import matplotlib.pyplot as plt
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from transformers import BartForConditionalGeneration, BartTokenizer
import torch
warnings.filterwarnings("ignore")

### Read the file via pandas

In [2]:
excel_file_path = f'C:/Users/sdany/Desktop/pilot project/pilotprtfinal.xlsx'
df = pd.read_excel(excel_file_path)

In [3]:
df.describe()

Unnamed: 0,Solicitation Year,Award Year,Award Amount,DUNS,Number Employees
count,1615.0,1615.0,1615.0,1593.0,1605.0
mean,2018.02291,2019.427864,584770.0,259644600.0,50.300312
std,7.252831,2.684268,1555689.0,315579500.0,76.233102
min,1900.0,2014.0,24900.0,0.0,0.0
25%,2016.0,2017.0,139860.5,79437490.0,6.0
50%,2019.0,2020.0,150000.0,116976200.0,17.0
75%,2021.0,2022.0,749999.8,188465800.0,57.0
max,2023.0,2023.0,55800000.0,969851500.0,468.0


In [4]:
## Labels and preprocess the text 

In [5]:
def change_labels(labels):
    label_map = {
        'Propulsion, motor, thruster, propplant': 'propulsion', 
        'Guidance, navigation, control, Attitude Determination': 'ACDS & GNC', 
        'System & Integration': 'System & Integration', 
        'Software': 'Software', 
        'Telecommunications': 'Communications', 
        'Sensors': 'Sensors',
        'Structure, Material & Mechanics': 'Mechanics',  
        'Command and Data Handling': 'C & DH',  
        'Power, energy': 'Power', 
        'Telemetry, Tracking and Command': 'TT & C'  
    }
    return label_map.get(labels, labels)

candidate_labels = [
    'Propulsion, motor, thruster, propplant', 
    'Guidance, navigation, control, Attitude Determination',
    'System & Integration',
    'Software', 
    'Telecommunications', 
    'Sensors',
    'Structure, Material & Mechanics',
    'Command and Data Handling', 
    'Power, energy', 
    'Telemetry, Tracking and Command'
]

# Function to preprocess text
def preprocess_text(text):
    if not isinstance(text, str):
        text = str(text)
    text = text.lower()
    return text

df['Abstract'] = df['Abstract'].apply(preprocess_text)

#### Model using and defineing tasks : two model for two task : Summa.. & Classi...

In [6]:
task = "zero-shot-classification"
modelId = "facebook/bart-large-mnli"
classifier = pipeline(task=task, model=modelId)
#summarizer = pipeline(task="summarization", model="sshleifer/distilbart-cnn-12-6")




### Process part 

In [7]:
results_summarization = []
results_classification = []
predicted_labels = []

#### Summarization

In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from tqdm import tqdm

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('punkt')

# Initialize stopwords set
stopWords = set(stopwords.words("english"))


# Iterate through each abstract in the DataFrame
for abstract in tqdm(df['Abstract'], desc="Summarizing"):
    if len(word_tokenize(abstract)) > 512:
        # Tokenize words and create frequency table
        words = word_tokenize(abstract)
        freqTable = dict()
        for word in words:
            word = word.lower()
            if word not in stopWords:
                freqTable[word] = freqTable.get(word, 0) + 1
        
        # Tokenize sentences
        sentences = sent_tokenize(abstract)
        
        # Calculate sentence values
        sentenceValue = dict()
        for sentence in sentences:
            for word, freq in freqTable.items():
                if word in sentence.lower():
                    sentenceValue[sentence] = sentenceValue.get(sentence, 0) + freq
        
        # Calculate average sentence value
        sumValues = sum(sentenceValue.values())
        average = int(sumValues / len(sentenceValue))
        
        # Generate summary based on sentence values
        summary = ''
        for sentence in sentences:
            if sentenceValue.get(sentence, 0) > (1.5 * average):
                summary += " " + sentence
        
        # Append summary to results
        results_summarization.append(summary)
    else:
        # If the abstract is short, use it as is
        results_summarization.append(abstract)


[nltk_data] Error loading stopwords: <urlopen error [WinError 10054]
[nltk_data]     An existing connection was forcibly closed by the
[nltk_data]     remote host>
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sdany\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Summarizing: 100%|███████████████████████████████████████████████████████████████| 1615/1615 [00:01<00:00, 1245.42it/s]


In [10]:
df['Summarization']=results_summarization
df['Summarization']

0       abstract:to address air force (af) spacecraft ...
1       leveraging the successful proof-of-concept hyb...
2       in this proposed effort, rsa will develop a de...
3       the growing international interest in missions...
4       our adversaries have demonstrated cruise missi...
                              ...                        
1610    pretalen proposes a broadcast antenna technolo...
1611    since the first navstar gps satellite was laun...
1612    since the first navstar gps satellite was laun...
1613    current navy reentry flight test bodies use a ...
1614    current navy reentry flight test bodies use a ...
Name: Summarization, Length: 1615, dtype: object

#### Classifying

In [None]:
for abstract in tqdm(df['Research Keywords'], desc="Classifying"):
    try:
        result = classifier(abstract, candidate_labels)
        results_classification.append(result)
        max_score_index = result['scores'].index(max(result['scores']))
        predicted_labels.append(result['labels'][max_score_index])
        print(len(results_classification),len(predicted_labels),'\n')
    
    except Exception as e:
        print(f"Error processing abstract: {e}")
        results_classification.append(None)
        predicted_labels.append(None)
        print(len(results_classification),len(predicted_labels),'\n')

Classifying:   0%|                                                                  | 1/1615 [00:03<1:27:56,  3.27s/it]

1 1 



Classifying:   0%|                                                                  | 2/1615 [00:06<1:22:45,  3.08s/it]

2 2 



Classifying:   0%|                                                                  | 3/1615 [00:08<1:16:19,  2.84s/it]

3 3 



Classifying:   0%|▏                                                                 | 4/1615 [00:12<1:22:34,  3.08s/it]

4 4 



Classifying:   0%|▏                                                                 | 5/1615 [00:15<1:27:51,  3.27s/it]

5 5 



Classifying:   0%|▏                                                                 | 6/1615 [00:20<1:37:51,  3.65s/it]

6 6 



Classifying:   0%|▎                                                                 | 7/1615 [00:24<1:43:17,  3.85s/it]

7 7 



Classifying:   0%|▎                                                                 | 8/1615 [00:28<1:48:18,  4.04s/it]

8 8 



Classifying:   1%|▎                                                                 | 9/1615 [00:32<1:46:47,  3.99s/it]

9 9 



Classifying:   1%|▍                                                                | 10/1615 [00:36<1:47:44,  4.03s/it]

10 10 



Classifying:   1%|▍                                                                | 11/1615 [00:41<1:50:34,  4.14s/it]

11 11 



Classifying:   1%|▍                                                                | 12/1615 [00:45<1:48:40,  4.07s/it]

12 12 



Classifying:   1%|▌                                                                | 13/1615 [00:49<1:48:05,  4.05s/it]

13 13 



Classifying:   1%|▌                                                                | 14/1615 [00:53<1:46:33,  3.99s/it]

14 14 



Classifying:   1%|▌                                                                | 15/1615 [00:56<1:45:43,  3.96s/it]

15 15 



Classifying:   1%|▋                                                                | 16/1615 [01:01<1:53:12,  4.25s/it]

16 16 



Classifying:   1%|▋                                                                | 17/1615 [01:07<2:04:21,  4.67s/it]

17 17 



Classifying:   1%|▋                                                                | 18/1615 [01:11<1:59:22,  4.48s/it]

18 18 



Classifying:   1%|▊                                                                | 19/1615 [01:15<1:55:46,  4.35s/it]

19 19 



Classifying:   1%|▊                                                                | 20/1615 [01:19<1:53:05,  4.25s/it]

20 20 



Classifying:   1%|▊                                                                | 21/1615 [01:23<1:48:23,  4.08s/it]

21 21 



Classifying:   1%|▉                                                                | 22/1615 [01:26<1:41:31,  3.82s/it]

22 22 



Classifying:   1%|▉                                                                | 23/1615 [01:29<1:37:17,  3.67s/it]

23 23 



Classifying:   1%|▉                                                                | 24/1615 [01:33<1:39:36,  3.76s/it]

24 24 



Classifying:   2%|█                                                                | 25/1615 [01:37<1:36:23,  3.64s/it]

25 25 



Classifying:   2%|█                                                                | 26/1615 [01:42<1:46:49,  4.03s/it]

26 26 



Classifying:   2%|█                                                                | 27/1615 [01:46<1:52:38,  4.26s/it]

27 27 



Classifying:   2%|█▏                                                               | 28/1615 [01:51<1:56:14,  4.39s/it]

28 28 



Classifying:   2%|█▏                                                               | 29/1615 [01:55<1:53:58,  4.31s/it]

29 29 



Classifying:   2%|█▏                                                               | 30/1615 [01:59<1:51:14,  4.21s/it]

30 30 



Classifying:   2%|█▏                                                               | 31/1615 [02:04<1:52:43,  4.27s/it]

31 31 



Classifying:   2%|█▎                                                               | 32/1615 [02:09<2:00:35,  4.57s/it]

32 32 



Classifying:   2%|█▎                                                               | 33/1615 [02:15<2:14:39,  5.11s/it]

33 33 



Classifying:   2%|█▎                                                               | 34/1615 [02:19<2:02:20,  4.64s/it]

34 34 



Classifying:   2%|█▍                                                               | 35/1615 [02:23<2:00:01,  4.56s/it]

35 35 



Classifying:   2%|█▍                                                               | 36/1615 [02:27<1:55:15,  4.38s/it]

36 36 



Classifying:   2%|█▍                                                               | 37/1615 [02:31<1:53:07,  4.30s/it]

37 37 



Classifying:   2%|█▌                                                               | 38/1615 [02:35<1:51:18,  4.23s/it]

38 38 



Classifying:   2%|█▌                                                               | 39/1615 [02:39<1:46:56,  4.07s/it]

39 39 



Classifying:   2%|█▌                                                               | 40/1615 [02:43<1:46:49,  4.07s/it]

40 40 



Classifying:   3%|█▋                                                               | 41/1615 [02:48<1:53:20,  4.32s/it]

41 41 



Classifying:   3%|█▋                                                               | 42/1615 [02:52<1:46:59,  4.08s/it]

42 42 



Classifying:   3%|█▋                                                               | 43/1615 [02:55<1:43:31,  3.95s/it]

43 43 



Classifying:   3%|█▊                                                               | 44/1615 [03:00<1:48:42,  4.15s/it]

44 44 



Classifying:   3%|█▊                                                               | 45/1615 [03:04<1:48:51,  4.16s/it]

45 45 



Classifying:   3%|█▊                                                               | 46/1615 [03:07<1:43:20,  3.95s/it]

46 46 



Classifying:   3%|█▉                                                               | 47/1615 [03:11<1:40:29,  3.85s/it]

47 47 



Classifying:   3%|█▉                                                               | 48/1615 [03:15<1:41:12,  3.88s/it]

48 48 



Classifying:   3%|█▉                                                               | 49/1615 [03:19<1:45:16,  4.03s/it]

49 49 



Classifying:   3%|██                                                               | 50/1615 [03:23<1:41:20,  3.89s/it]

50 50 



Classifying:   3%|██                                                               | 51/1615 [03:27<1:41:00,  3.88s/it]

51 51 



Classifying:   3%|██                                                               | 52/1615 [03:31<1:42:09,  3.92s/it]

52 52 



Classifying:   3%|██▏                                                              | 53/1615 [03:35<1:41:16,  3.89s/it]

53 53 



Classifying:   3%|██▏                                                              | 54/1615 [03:39<1:44:26,  4.01s/it]

54 54 



Classifying:   3%|██▏                                                              | 55/1615 [03:44<1:52:18,  4.32s/it]

55 55 



Classifying:   3%|██▎                                                              | 56/1615 [03:49<1:57:30,  4.52s/it]

56 56 



Classifying:   4%|██▎                                                              | 57/1615 [03:53<1:51:55,  4.31s/it]

57 57 



Classifying:   4%|██▎                                                              | 58/1615 [03:57<1:49:20,  4.21s/it]

58 58 



Classifying:   4%|██▎                                                              | 59/1615 [04:02<1:55:42,  4.46s/it]

59 59 



Classifying:   4%|██▍                                                              | 60/1615 [04:04<1:41:26,  3.91s/it]

60 60 



Classifying:   4%|██▍                                                              | 61/1615 [04:09<1:44:52,  4.05s/it]

61 61 



Classifying:   4%|██▍                                                              | 62/1615 [04:12<1:35:43,  3.70s/it]

62 62 



Classifying:   4%|██▌                                                              | 63/1615 [04:15<1:30:04,  3.48s/it]

63 63 



Classifying:   4%|██▌                                                              | 64/1615 [04:19<1:35:28,  3.69s/it]

64 64 



Classifying:   4%|██▌                                                              | 65/1615 [04:23<1:37:53,  3.79s/it]

65 65 



Classifying:   4%|██▋                                                              | 66/1615 [04:27<1:40:12,  3.88s/it]

66 66 



Classifying:   4%|██▋                                                              | 67/1615 [04:32<1:46:08,  4.11s/it]

67 67 



Classifying:   4%|██▋                                                              | 68/1615 [04:36<1:48:42,  4.22s/it]

68 68 



Classifying:   4%|██▊                                                              | 69/1615 [04:39<1:42:13,  3.97s/it]

69 69 



Classifying:   4%|██▊                                                              | 70/1615 [04:42<1:32:36,  3.60s/it]

70 70 



Classifying:   4%|██▊                                                              | 71/1615 [04:46<1:32:53,  3.61s/it]

71 71 



Classifying:   4%|██▉                                                              | 72/1615 [04:50<1:36:34,  3.76s/it]

72 72 



Classifying:   5%|██▉                                                              | 73/1615 [04:53<1:31:36,  3.56s/it]

73 73 



Classifying:   5%|██▉                                                              | 74/1615 [04:58<1:40:52,  3.93s/it]

74 74 

Error processing abstract: object of type 'float' has no len()
75 75 



Classifying:   5%|███                                                              | 76/1615 [05:02<1:17:30,  3.02s/it]

76 76 



Classifying:   5%|███                                                              | 77/1615 [05:06<1:26:55,  3.39s/it]

77 77 



Classifying:   5%|███▏                                                             | 78/1615 [05:10<1:25:53,  3.35s/it]

78 78 



Classifying:   5%|███▏                                                             | 79/1615 [05:15<1:38:44,  3.86s/it]

79 79 



Classifying:   5%|███▏                                                             | 80/1615 [05:24<2:18:47,  5.42s/it]

80 80 



Classifying:   5%|███▎                                                             | 81/1615 [05:28<2:05:23,  4.90s/it]

81 81 



Classifying:   5%|███▎                                                             | 82/1615 [05:31<1:52:45,  4.41s/it]

82 82 



Classifying:   5%|███▎                                                             | 83/1615 [05:36<1:55:54,  4.54s/it]

83 83 



Classifying:   5%|███▍                                                             | 84/1615 [05:41<1:56:55,  4.58s/it]

84 84 



Classifying:   5%|███▍                                                             | 85/1615 [05:44<1:51:32,  4.37s/it]

85 85 



Classifying:   5%|███▍                                                             | 86/1615 [05:48<1:47:17,  4.21s/it]

86 86 



Classifying:   5%|███▌                                                             | 87/1615 [05:52<1:41:00,  3.97s/it]

87 87 



Classifying:   5%|███▌                                                             | 88/1615 [05:55<1:36:07,  3.78s/it]

88 88 



Classifying:   6%|███▌                                                             | 89/1615 [05:59<1:36:20,  3.79s/it]

89 89 



Classifying:   6%|███▌                                                             | 90/1615 [06:03<1:40:45,  3.96s/it]

90 90 



Classifying:   6%|███▋                                                             | 91/1615 [06:07<1:39:41,  3.92s/it]

91 91 



Classifying:   6%|███▋                                                             | 92/1615 [06:11<1:37:57,  3.86s/it]

92 92 



Classifying:   6%|███▋                                                             | 93/1615 [06:15<1:43:22,  4.08s/it]

93 93 



Classifying:   6%|███▊                                                             | 94/1615 [06:19<1:43:53,  4.10s/it]

94 94 



Classifying:   6%|███▊                                                             | 95/1615 [06:25<1:52:43,  4.45s/it]

95 95 



Classifying:   6%|███▊                                                             | 96/1615 [06:29<1:52:46,  4.45s/it]

96 96 



Classifying:   6%|███▉                                                             | 97/1615 [06:33<1:46:17,  4.20s/it]

97 97 



Classifying:   6%|███▉                                                             | 98/1615 [06:37<1:48:26,  4.29s/it]

98 98 



Classifying:   6%|███▉                                                             | 99/1615 [06:42<1:48:06,  4.28s/it]

99 99 



Classifying:   6%|███▉                                                            | 100/1615 [06:46<1:47:28,  4.26s/it]

100 100 



Classifying:   6%|████                                                            | 101/1615 [06:50<1:45:23,  4.18s/it]

101 101 



Classifying:   6%|████                                                            | 102/1615 [06:53<1:40:11,  3.97s/it]

102 102 



Classifying:   6%|████                                                            | 103/1615 [06:57<1:40:48,  4.00s/it]

103 103 



Classifying:   6%|████                                                            | 104/1615 [07:01<1:40:45,  4.00s/it]

104 104 



Classifying:   7%|████▏                                                           | 105/1615 [07:06<1:44:28,  4.15s/it]

105 105 



Classifying:   7%|████▏                                                           | 106/1615 [07:11<1:49:03,  4.34s/it]

106 106 



Classifying:   7%|████▏                                                           | 107/1615 [07:19<2:21:26,  5.63s/it]

107 107 



Classifying:   7%|████▎                                                           | 108/1615 [07:25<2:24:13,  5.74s/it]

108 108 



Classifying:   7%|████▎                                                           | 109/1615 [07:30<2:18:08,  5.50s/it]

109 109 



Classifying:   7%|████▎                                                           | 110/1615 [07:34<2:07:19,  5.08s/it]

110 110 



Classifying:   7%|████▍                                                           | 111/1615 [07:39<2:02:57,  4.91s/it]

111 111 



Classifying:   7%|████▍                                                           | 112/1615 [07:43<1:56:01,  4.63s/it]

112 112 



Classifying:   7%|████▍                                                           | 113/1615 [07:48<1:58:12,  4.72s/it]

113 113 



Classifying:   7%|████▌                                                           | 114/1615 [07:51<1:49:27,  4.38s/it]

114 114 



Classifying:   7%|████▌                                                           | 115/1615 [07:56<1:50:35,  4.42s/it]

115 115 



Classifying:   7%|████▌                                                           | 116/1615 [08:01<1:53:31,  4.54s/it]

116 116 



Classifying:   7%|████▋                                                           | 117/1615 [08:04<1:44:11,  4.17s/it]

117 117 



Classifying:   7%|████▋                                                           | 118/1615 [08:08<1:45:32,  4.23s/it]

118 118 



Classifying:   7%|████▋                                                           | 119/1615 [08:12<1:41:22,  4.07s/it]

119 119 



Classifying:   7%|████▊                                                           | 120/1615 [08:16<1:39:46,  4.00s/it]

120 120 



Classifying:   7%|████▊                                                           | 121/1615 [08:20<1:44:04,  4.18s/it]

121 121 



Classifying:   8%|████▊                                                           | 122/1615 [08:23<1:35:26,  3.84s/it]

122 122 



Classifying:   8%|████▊                                                           | 123/1615 [08:28<1:43:30,  4.16s/it]

123 123 



Classifying:   8%|████▉                                                           | 124/1615 [08:33<1:45:33,  4.25s/it]

124 124 



Classifying:   8%|████▉                                                           | 125/1615 [08:37<1:48:52,  4.38s/it]

125 125 



Classifying:   8%|████▉                                                           | 126/1615 [08:43<1:54:46,  4.63s/it]

126 126 



Classifying:   8%|█████                                                           | 127/1615 [08:47<1:54:49,  4.63s/it]

127 127 



Classifying:   8%|█████                                                           | 128/1615 [08:53<2:01:13,  4.89s/it]

128 128 



Classifying:   8%|█████                                                           | 129/1615 [08:58<2:00:16,  4.86s/it]

129 129 



Classifying:   8%|█████▏                                                          | 130/1615 [09:02<1:54:59,  4.65s/it]

130 130 



Classifying:   8%|█████▏                                                          | 131/1615 [09:05<1:47:06,  4.33s/it]

131 131 



Classifying:   8%|█████▏                                                          | 132/1615 [09:09<1:45:06,  4.25s/it]

132 132 



In [None]:
for label in candidate_labels:
    df[change_labels(label)] = [result['scores'][result['labels'].index(label)] if result and label in result['labels'] else 0 for result in results_classification]

In [None]:
df['Predicted Label'] = [change_labels(label) for label in predicted_labels] 


### Saving on output path

In [None]:
output_file_path = f'C:/Users/sdany/Desktop/pilot project/output_50jup.xlsx'
df.to_excel(output_file_path, index=False)

print(f"Output file path: {output_file_path}")

### Open Excel file

In [None]:
if os.name == 'nt':
    os.startfile(output_file_path)
elif os.name == 'posix':
    subprocess.call(['open', output_file_path])
else:
    subprocess.call(['xdg-open', output_file_path])

In [None]:
if os.name == 'nt':
    os.startfile(excel_file_path)
elif os.name == 'posix':
    subprocess.call(['open', excel_file_path])
else:
    subprocess.call(['xdg-open', excel_file_path])